367 files changed, 28279 insertions, 11556 deletions
@@ -20,6 +20,7 @@ Hui Su <huisu@google.com> Jacky Chen <jackychen@google.com> Jim Bankoski <jimbankoski@google.com> Johann Koenig <johannkoenig@google.com> +Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com> Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com> Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org> Johann <johann@duck.com> <johann.koenig@gmail.com> @@ -53,4 +54,4 @@ Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com> Yaowu Xu <yaowu@google.com> <Yaowu Xu> Venkatarama NG. Avadhani <venkatarama.avadhani@ittiam.com> Vitaly Buka <vitalybuka@chromium.org> <vitlaybuka@chromium.org> -xiwei gu <guxiwei-hf@loongson.cn> +Xiwei Gu <guxiwei-hf@loongson.cn> @@ -25,6 +25,7 @@ Andrew Salkeld <andrew.salkeld@arm.com> Angie Chen <yunqi@google.com> Angie Chiang <angiebird@google.com> Anton Venema <anton.venema@liveswitch.com> +Anupam Pandey <anupam.pandey@ittiam.com> Aron Rosenberg <arosenberg@logitech.com> Attila Nagy <attilanagy@google.com> Birk Magnussen <birk.magnussen@googlemail.com> @@ -34,6 +35,8 @@ Brion Vibber <bvibber@wikimedia.org> changjun.yang <changjun.yang@intel.com> Charles 'Buck' Krasic <ckrasic@google.com> Cheng Chen <chengchen@google.com> +Chen Wang <wangchen20@iscas.ac.cn> +Cherma Rajan A <cherma.rajan@ittiam.com> Chi Yo Tsai <chiyotsai@google.com> chm <chm@rock-chips.com> Chris Cunningham <chcunningham@chromium.org> @@ -60,6 +63,8 @@ Fritz Koenig <frkoenig@google.com> Fyodor Kyslov <kyslov@google.com> Gabriel Marin <gmx@chromium.org> Gaute Strokkenes <gaute.strokkenes@broadcom.com> +George Steed <george.steed@arm.com> +Gerda Zsejke More <gerdazsejke.more@arm.com> Geza Lore <gezalore@gmail.com> Ghislain MARY <ghislainmary2@gmail.com> Giuseppe Scrivano <gscrivano@gnu.org> @@ -103,6 +108,7 @@ Jin Bo <jinbo@loongson.cn> Jingning Han <jingning@google.com> Joel Fernandes <joelaf@google.com> Joey Parrish <joeyparrish@google.com> +Johann <johann@duck.com> Johann Koenig <johannkoenig@google.com> John Koleszar <jkoleszar@google.com> Johnny Klonaris <google@jawknee.com> @@ -120,6 +126,7 @@ KO Myung-Hun <komh@chollian.net> Konstantinos Margaritis <konma@vectorcamp.gr> Kyle Siefring <kylesiefring@gmail.com> Lawrence Velázquez <larryv@macports.org> +L. E. Segovia <amy@amyspark.me> Linfeng Zhang <linfengz@google.com> Liu Peng <pengliu.mail@gmail.com> Lou Quillio <louquillio@google.com> @@ -147,6 +154,7 @@ Mirko Bonadei <mbonadei@google.com> Moriyoshi Koizumi <mozo@mozo.jp> Morton Jonuschat <yabawock@gmail.com> Nathan E. 
Egge <negge@mozilla.com> +Neeraj Gadgil <neeraj.gadgil@ittiam.com> Neil Birkbeck <neil.birkbeck@gmail.com> Nico Weber <thakis@chromium.org> Niveditha Rau <niveditha.rau@gmail.com> @@ -213,7 +221,8 @@ Vitaly Buka <vitalybuka@chromium.org> Vlad Tsyrklevich <vtsyrklevich@chromium.org> Wan-Teh Chang <wtc@google.com> Wonkap Jang <wonkap@google.com> -xiwei gu <guxiwei-hf@loongson.cn> +Xiahong Bao <xiahong.bao@nxp.com> +Xiwei Gu <guxiwei-hf@loongson.cn> Yaowu Xu <yaowu@google.com> Yi Luo <luoyi@google.com> Yongzhe Wang <yongzhe@google.com> diff --git a/Android.bp b/Android.bp index 952765b90..b770ff9a0 100644 --- a/Android.bp +++ b/Android.bp @@ -111,7 +111,9 @@ libvpx_arm_neon_c_srcs = [ "vp9/decoder/vp9_job_queue.c", "vp9/encoder/arm/neon/vp9_dct_neon.c", "vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c", + "vp9/encoder/arm/neon/vp9_error_neon.c", "vp9/encoder/arm/neon/vp9_frame_scale_neon.c", + "vp9/encoder/arm/neon/vp9_highbd_error_neon.c", "vp9/encoder/arm/neon/vp9_quantize_neon.c", "vp9/encoder/vp9_aq_cyclicrefresh.c", "vp9/encoder/vp9_bitstream.c", @@ -143,6 +145,7 @@ libvpx_arm_neon_c_srcs = [ "vp9/encoder/vp9_subexp.c", "vp9/encoder/vp9_svc_layercontext.c", "vp9/encoder/vp9_tokenize.c", + "vp9/encoder/vp9_tpl_model.c", "vp9/encoder/vp9_treewriter.c", "vp9/vp9_cx_iface.c", "vp9/vp9_dx_iface.c", @@ -151,6 +154,7 @@ libvpx_arm_neon_c_srcs = [ "vpx/src/vpx_decoder.c", "vpx/src/vpx_encoder.c", "vpx/src/vpx_image.c", + "vpx/src/vpx_tpl.c", "vpx_dsp/arm/avg_neon.c", "vpx_dsp/arm/avg_pred_neon.c", "vpx_dsp/arm/fdct4x4_neon.c", @@ -159,6 +163,9 @@ libvpx_arm_neon_c_srcs = [ "vpx_dsp/arm/fdct32x32_neon.c", "vpx_dsp/arm/fdct_partial_neon.c", "vpx_dsp/arm/hadamard_neon.c", + "vpx_dsp/arm/highbd_avg_neon.c", + "vpx_dsp/arm/highbd_avg_pred_neon.c", + "vpx_dsp/arm/highbd_hadamard_neon.c", "vpx_dsp/arm/highbd_idct4x4_add_neon.c", "vpx_dsp/arm/highbd_idct8x8_add_neon.c", "vpx_dsp/arm/highbd_idct16x16_add_neon.c", @@ -169,7 +176,10 @@ libvpx_arm_neon_c_srcs = [ "vpx_dsp/arm/highbd_intrapred_neon.c", "vpx_dsp/arm/highbd_loopfilter_neon.c", "vpx_dsp/arm/highbd_quantize_neon.c", + "vpx_dsp/arm/highbd_sad4d_neon.c", "vpx_dsp/arm/highbd_sad_neon.c", + "vpx_dsp/arm/highbd_sse_neon.c", + "vpx_dsp/arm/highbd_subpel_variance_neon.c", "vpx_dsp/arm/highbd_variance_neon.c", "vpx_dsp/arm/highbd_vpx_convolve8_neon.c", "vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c", @@ -187,6 +197,7 @@ libvpx_arm_neon_c_srcs = [ "vpx_dsp/arm/quantize_neon.c", "vpx_dsp/arm/sad4d_neon.c", "vpx_dsp/arm/sad_neon.c", + "vpx_dsp/arm/sse_neon.c", "vpx_dsp/arm/subpel_variance_neon.c", "vpx_dsp/arm/subtract_neon.c", "vpx_dsp/arm/sum_squares_neon.c", @@ -208,13 +219,14 @@ libvpx_arm_neon_c_srcs = [ "vpx_dsp/quantize.c", "vpx_dsp/sad.c", "vpx_dsp/skin_detection.c", + "vpx_dsp/sse.c", "vpx_dsp/subtract.c", "vpx_dsp/sum_squares.c", "vpx_dsp/variance.c", "vpx_dsp/vpx_convolve.c", "vpx_dsp/vpx_dsp_rtcd.c", "vpx_mem/vpx_mem.c", - "vpx_ports/arm_cpudetect.c", + "vpx_ports/aarch32_cpudetect.c", "vpx_scale/generic/gen_scalers.c", "vpx_scale/generic/vpx_scale.c", "vpx_scale/generic/yv12config.c", @@ -355,7 +367,9 @@ libvpx_arm64_c_srcs = [ "vp9/decoder/vp9_job_queue.c", "vp9/encoder/arm/neon/vp9_dct_neon.c", "vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c", + "vp9/encoder/arm/neon/vp9_error_neon.c", "vp9/encoder/arm/neon/vp9_frame_scale_neon.c", + "vp9/encoder/arm/neon/vp9_highbd_error_neon.c", "vp9/encoder/arm/neon/vp9_quantize_neon.c", "vp9/encoder/vp9_aq_cyclicrefresh.c", "vp9/encoder/vp9_bitstream.c", @@ -387,6 +401,7 @@ libvpx_arm64_c_srcs = [ 
"vp9/encoder/vp9_subexp.c", "vp9/encoder/vp9_svc_layercontext.c", "vp9/encoder/vp9_tokenize.c", + "vp9/encoder/vp9_tpl_model.c", "vp9/encoder/vp9_treewriter.c", "vp9/vp9_cx_iface.c", "vp9/vp9_dx_iface.c", @@ -395,6 +410,7 @@ libvpx_arm64_c_srcs = [ "vpx/src/vpx_decoder.c", "vpx/src/vpx_encoder.c", "vpx/src/vpx_image.c", + "vpx/src/vpx_tpl.c", "vpx_dsp/arm/avg_neon.c", "vpx_dsp/arm/avg_pred_neon.c", "vpx_dsp/arm/fdct4x4_neon.c", @@ -403,6 +419,9 @@ libvpx_arm64_c_srcs = [ "vpx_dsp/arm/fdct32x32_neon.c", "vpx_dsp/arm/fdct_partial_neon.c", "vpx_dsp/arm/hadamard_neon.c", + "vpx_dsp/arm/highbd_avg_neon.c", + "vpx_dsp/arm/highbd_avg_pred_neon.c", + "vpx_dsp/arm/highbd_hadamard_neon.c", "vpx_dsp/arm/highbd_idct4x4_add_neon.c", "vpx_dsp/arm/highbd_idct8x8_add_neon.c", "vpx_dsp/arm/highbd_idct16x16_add_neon.c", @@ -413,7 +432,10 @@ libvpx_arm64_c_srcs = [ "vpx_dsp/arm/highbd_intrapred_neon.c", "vpx_dsp/arm/highbd_loopfilter_neon.c", "vpx_dsp/arm/highbd_quantize_neon.c", + "vpx_dsp/arm/highbd_sad4d_neon.c", "vpx_dsp/arm/highbd_sad_neon.c", + "vpx_dsp/arm/highbd_sse_neon.c", + "vpx_dsp/arm/highbd_subpel_variance_neon.c", "vpx_dsp/arm/highbd_variance_neon.c", "vpx_dsp/arm/highbd_vpx_convolve8_neon.c", "vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c", @@ -434,6 +456,7 @@ libvpx_arm64_c_srcs = [ "vpx_dsp/arm/quantize_neon.c", "vpx_dsp/arm/sad4d_neon.c", "vpx_dsp/arm/sad_neon.c", + "vpx_dsp/arm/sse_neon.c", "vpx_dsp/arm/subpel_variance_neon.c", "vpx_dsp/arm/subtract_neon.c", "vpx_dsp/arm/sum_squares_neon.c", @@ -457,13 +480,14 @@ libvpx_arm64_c_srcs = [ "vpx_dsp/quantize.c", "vpx_dsp/sad.c", "vpx_dsp/skin_detection.c", + "vpx_dsp/sse.c", "vpx_dsp/subtract.c", "vpx_dsp/sum_squares.c", "vpx_dsp/variance.c", "vpx_dsp/vpx_convolve.c", "vpx_dsp/vpx_dsp_rtcd.c", "vpx_mem/vpx_mem.c", - "vpx_ports/arm_cpudetect.c", + "vpx_ports/aarch64_cpudetect.c", "vpx_scale/generic/gen_scalers.c", "vpx_scale/generic/vpx_scale.c", "vpx_scale/generic/yv12config.c", @@ -587,6 +611,7 @@ libvpx_generic_c_srcs = [ "vp9/encoder/vp9_subexp.c", "vp9/encoder/vp9_svc_layercontext.c", "vp9/encoder/vp9_tokenize.c", + "vp9/encoder/vp9_tpl_model.c", "vp9/encoder/vp9_treewriter.c", "vp9/vp9_cx_iface.c", "vp9/vp9_dx_iface.c", @@ -595,6 +620,7 @@ libvpx_generic_c_srcs = [ "vpx/src/vpx_decoder.c", "vpx/src/vpx_encoder.c", "vpx/src/vpx_image.c", + "vpx/src/vpx_tpl.c", "vpx_dsp/avg.c", "vpx_dsp/bitreader.c", "vpx_dsp/bitreader_buffer.c", @@ -609,6 +635,7 @@ libvpx_generic_c_srcs = [ "vpx_dsp/quantize.c", "vpx_dsp/sad.c", "vpx_dsp/skin_detection.c", + "vpx_dsp/sse.c", "vpx_dsp/subtract.c", "vpx_dsp/sum_squares.c", "vpx_dsp/variance.c", @@ -750,6 +777,7 @@ libvpx_x86_c_srcs = [ "vp9/encoder/vp9_subexp.c", "vp9/encoder/vp9_svc_layercontext.c", "vp9/encoder/vp9_tokenize.c", + "vp9/encoder/vp9_tpl_model.c", "vp9/encoder/vp9_treewriter.c", "vp9/encoder/x86/vp9_dct_intrin_sse2.c", "vp9/encoder/x86/vp9_frame_scale_ssse3.c", @@ -763,6 +791,7 @@ libvpx_x86_c_srcs = [ "vpx/src/vpx_decoder.c", "vpx/src/vpx_encoder.c", "vpx/src/vpx_image.c", + "vpx/src/vpx_tpl.c", "vpx_dsp/add_noise.c", "vpx_dsp/avg.c", "vpx_dsp/bitreader.c", @@ -779,6 +808,7 @@ libvpx_x86_c_srcs = [ "vpx_dsp/quantize.c", "vpx_dsp/sad.c", "vpx_dsp/skin_detection.c", + "vpx_dsp/sse.c", "vpx_dsp/subtract.c", "vpx_dsp/sum_squares.c", "vpx_dsp/variance.c", @@ -986,6 +1016,7 @@ libvpx_x86_64_c_srcs = [ "vp9/encoder/vp9_subexp.c", "vp9/encoder/vp9_svc_layercontext.c", "vp9/encoder/vp9_tokenize.c", + "vp9/encoder/vp9_tpl_model.c", "vp9/encoder/vp9_treewriter.c", "vp9/encoder/x86/vp9_dct_intrin_sse2.c", 
"vp9/encoder/x86/vp9_frame_scale_ssse3.c", @@ -999,6 +1030,7 @@ libvpx_x86_64_c_srcs = [ "vpx/src/vpx_decoder.c", "vpx/src/vpx_encoder.c", "vpx/src/vpx_image.c", + "vpx/src/vpx_tpl.c", "vpx_dsp/add_noise.c", "vpx_dsp/avg.c", "vpx_dsp/bitreader.c", @@ -1015,6 +1047,7 @@ libvpx_x86_64_c_srcs = [ "vpx_dsp/quantize.c", "vpx_dsp/sad.c", "vpx_dsp/skin_detection.c", + "vpx_dsp/sse.c", "vpx_dsp/subtract.c", "vpx_dsp/sum_squares.c", "vpx_dsp/variance.c", @@ -1,3 +1,80 @@ +2024-01-02 v1.14.0 "Venetian Duck" + This release drops support for old C compilers, such as Visual Studio 2012 + and older, that disallow mixing variable declarations and statements (a C99 + feature). It adds support for run-time CPU feature detection for Arm + platforms, as well as support for darwin23 (macOS 14). + + - Upgrading: + This release is ABI incompatible with the previous release. + + Various new features for rate control library for real-time: SVC parallel + encoding, loopfilter level, support for frame dropping, and screen content. + + New callback function send_tpl_gop_stats for vp9 external rate control + library, which can be used to transmit TPL stats for a group of pictures. A + public header vpx_tpl.h is added for the definition of TPL stats used in + this callback. + + libwebm is upgraded to libwebm-1.0.0.29-9-g1930e3c. + + - Enhancement: + Improvements on Neon optimizations: VoD: 12-35% speed up for bitdepth 8, + 68%-151% speed up for high bitdepth. + + Improvements on AVX2 and SSE optimizations. + Improvements on LSX optimizations for LoongArch. + 42-49% speedup on speed 0 VoD encoding. + Android API level predicates. + + - Bug fixes: + Fix to missing prototypes from the rtcd header. + Fix to segfault when total size is enlarged but width is smaller. + Fix to the build for arm64ec using MSVC. + Fix to copy BLOCK_8X8's mi to PICK_MODE_CONTEXT::mic. + Fix to -Wshadow warnings. + Fix to heap overflow in vpx_get4x4sse_cs_neon. + Fix to buffer overrun in highbd Neon subpel variance filters. + Added bitexact encode test script. + Fix to -Wl,-z,defs with Clang's sanitizers. + Fix to decoder stability after error & continued decoding. + Fix to mismatch of VP9 encode with NEON intrinsics with C only version. + Fix to Arm64 MSVC compile vpx_highbd_fdct4x4_neon. + Fix to fragments count before use. + Fix to a case where target bandwidth is 0 for SVC. + Fix mask in vp9_quantize_avx2,highbd_get_max_lane_eob. + Fix to int overflow in vp9_calc_pframe_target_size_one_pass_cbr. + Fix to integer overflow in vp8,ratectrl.c. + Fix to integer overflow in vp9 svc. + Fix to avg_frame_bandwidth overflow. + Fix to per frame qp for temporal layers. + Fix to unsigned integer overflow in sse computation. + Fix to uninitialized mesh feature for BEST mode. + Fix to overflow in highbd temporal_filter. + Fix to unaligned loads w/w==4 in vpx_convolve_copy_neon. + Skip arm64_neon.h workaround w/VS >= 2019. + Fix to c vs avx mismatch of diamond_search_sad(). + Fix to c vs intrinsic mismatch of vpx_hadamard_32x32() function. + Fix to a bug in vpx_hadamard_32x32_neon(). + Fix to Clang -Wunreachable-code-aggressive warnings. + Fix to a bug in vpx_highbd_hadamard_32x32_neon(). + Fix to -Wunreachable-code in mfqe_partition. + Force mode search on 64x64 if no mode is selected. + Fix to ubsan failure caused by left shift of negative. + Fix to integer overflow in calc_pframe_target_size. + Fix to float-cast-overflow in vp8_change_config(). + Fix to a null ptr before use. + Conditionally skip using inter frames in speed features. 
+ Remove invalid reference frames. + Disable intra mode search speed features conditionally. + Set nonrd keyframe under dynamic change of deadline for rtc. + Fix to scaled reference offsets. + Set skip_recode=0 in nonrd_pick_sb_modes. + Fix to an edge case when downsizing to one. + Fix to a bug in frame scaling. + Fix to pred buffer stride. + Fix to a bug in simple motion search. + Update frame size in actual encoding. + 2023-09-29 v1.13.1 "Ugly Duckling" This release contains two security related fixes. One each for VP8 and VP9. @@ -64,9 +64,16 @@ COMPILING THE APPLICATIONS/LIBRARIES: arm64-android-gcc arm64-darwin-gcc arm64-darwin20-gcc + arm64-darwin21-gcc + arm64-darwin22-gcc + arm64-darwin23-gcc arm64-linux-gcc arm64-win64-gcc arm64-win64-vs15 + arm64-win64-vs16 + arm64-win64-vs16-clangcl + arm64-win64-vs17 + arm64-win64-vs17-clangcl armv7-android-gcc armv7-darwin-gcc armv7-linux-rvct @@ -75,8 +82,12 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv7-win32-gcc armv7-win32-vs14 armv7-win32-vs15 + armv7-win32-vs16 + armv7-win32-vs17 armv7s-darwin-gcc armv8-linux-gcc + loongarch32-linux-gcc + loongarch64-linux-gcc mips32-linux-gcc mips64-linux-gcc ppc64le-linux-gcc @@ -117,6 +128,9 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-darwin18-gcc x86_64-darwin19-gcc x86_64-darwin20-gcc + x86_64-darwin21-gcc + x86_64-darwin22-gcc + x86_64-darwin23-gcc x86_64-iphonesimulator-gcc x86_64-linux-gcc x86_64-linux-icc @@ -135,7 +135,6 @@ unsigned int arg_parse_uint(const struct arg *arg) { } die("Option %s: Invalid character '%c'\n", arg->name, *endptr); - return 0; } int arg_parse_int(const struct arg *arg) { @@ -152,7 +151,6 @@ int arg_parse_int(const struct arg *arg) { } die("Option %s: Invalid character '%c'\n", arg->name, *endptr); - return 0; } struct vpx_rational { @@ -209,7 +207,6 @@ int arg_parse_enum(const struct arg *arg) { if (!strcmp(arg->val, listptr->name)) return listptr->val; die("Option %s: Invalid value '%s'\n", arg->name, arg->val); - return 0; } int arg_parse_enum_or_int(const struct arg *arg) { diff --git a/build/make/Makefile b/build/make/Makefile index 5c38c18e5..199ed7805 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -143,6 +143,14 @@ $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(BUILD_PFX)%_avx512.c.d: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl $(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl +# AARCH64 +$(BUILD_PFX)%_neon_dotprod.c.d: CFLAGS += -march=armv8.2-a+dotprod +$(BUILD_PFX)%_neon_dotprod.c.o: CFLAGS += -march=armv8.2-a+dotprod +$(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm +$(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm +$(BUILD_PFX)%_sve.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve +$(BUILD_PFX)%_sve.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve + # POWER $(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx $(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx @@ -304,6 +312,19 @@ $(1): $(qexec)$$(AR) $$(ARFLAGS) $$@ $$^ endef +# Don't use -Wl,-z,defs with Clang's sanitizers. +# +# Clang's AddressSanitizer documentation says "When linking shared libraries, +# the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link +# errors (don't use it with AddressSanitizer)." See +# https://clang.llvm.org/docs/AddressSanitizer.html#usage. 
+NO_UNDEFINED := -Wl,-z,defs
+ifeq ($(findstring clang,$(CC)),clang)
+  ifneq ($(filter -fsanitize=%,$(LDFLAGS)),)
+    NO_UNDEFINED :=
+  endif
+endif
+
 define so_template
 # Not using a pattern rule here because we don't want to generate empty
 # archives when they are listed as a dependency in files not responsible
@@ -313,7 +334,8 @@ define so_template
 $(1):
	$(if $(quiet),@echo " [LD] $$@")
	$(qexec)$$(LD) -shared $$(LDFLAGS) \
-            -Wl,--no-undefined -Wl,-soname,$$(SONAME) \
+            $(NO_UNDEFINED) \
+            -Wl,-soname,$$(SONAME) \
             -Wl,--version-script,$$(EXPORTS_FILE) -o $$@ \
             $$(filter %.o,$$^) $$(extralibs)
 endef
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 4bf090f00..b645a666f 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -521,6 +521,7 @@ AS_SFX = ${AS_SFX:-.asm}
 EXE_SFX = ${EXE_SFX}
 VCPROJ_SFX = ${VCPROJ_SFX}
 RTCD_OPTIONS = ${RTCD_OPTIONS}
+LIBWEBM_CXXFLAGS = ${LIBWEBM_CXXFLAGS}
 LIBYUV_CXXFLAGS = ${LIBYUV_CXXFLAGS}
 EOF
 
@@ -791,7 +792,7 @@ process_common_toolchain() {
       tgt_isa=x86_64
       tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'`
       ;;
-    *darwin2[0-2]*)
+    *darwin2[0-3]*)
       tgt_isa=`uname -m`
       tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'`
       ;;
@@ -842,6 +843,10 @@ process_common_toolchain() {
   # Enable the architecture family
   case ${tgt_isa} in
+    arm64 | armv8)
+      enable_feature arm
+      enable_feature aarch64
+      ;;
     arm*)
       enable_feature arm
       ;;
@@ -858,8 +863,14 @@ process_common_toolchain() {
       ;;
   esac
 
-  # PIC is probably what we want when building shared libs
+  # Position independent code (PIC) is probably what we want when building
+  # shared libs or position independent executable (PIE) targets.
   enabled shared && soft_enable pic
+  check_cpp << EOF || soft_enable pic
+#if !(__pie__ || __PIE__)
+#error Neither __pie__ or __PIE__ are set
+#endif
+EOF
 
   # Minimum iOS version for all target platforms (darwin and iphonesimulator).
   # Shared library framework builds are only possible on iOS 8 and later.
@@ -940,7 +951,7 @@ process_common_toolchain() {
       add_cflags "-mmacosx-version-min=10.15"
       add_ldflags "-mmacosx-version-min=10.15"
       ;;
-    *-darwin2[0-2]-*)
+    *-darwin2[0-3]-*)
       add_cflags "-arch ${toolchain%%-*}"
       add_ldflags "-arch ${toolchain%%-*}"
       ;;
@@ -965,13 +976,26 @@ process_common_toolchain() {
       ;;
   esac
 
-  # Process ARM architecture variants
+  # Process architecture variants
   case ${toolchain} in
     arm*)
-      # on arm, isa versions are supersets
+      soft_enable runtime_cpu_detect
+      # Arm ISA extensions are treated as supersets.
      case ${tgt_isa} in
         arm64|armv8)
-          soft_enable neon
+          for ext in ${ARCH_EXT_LIST_AARCH64}; do
+            # Disable higher order extensions to simplify dependencies.
+            if [ "$disable_exts" = "yes" ]; then
+              if ! disabled $ext; then
+                RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+                disable_feature $ext
+              fi
+            elif disabled $ext; then
+              disable_exts="yes"
+            else
+              soft_enable $ext
+            fi
+          done
           ;;
         armv7|armv7s)
           soft_enable neon
@@ -1066,8 +1090,11 @@ EOF
             enable_feature win_arm64_neon_h_workaround
           else
             # If a probe is not possible, assume this is the pure Windows
-            # SDK and so the workaround is necessary.
-            enable_feature win_arm64_neon_h_workaround
+            # SDK and so the workaround is necessary when using Visual
+            # Studio < 2019.
+            if [ ${tgt_cc##vs} -lt 16 ]; then
+              enable_feature win_arm64_neon_h_workaround
+            fi
           fi
         fi
       fi
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index 58bb66b9e..1e1db05bb 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -141,7 +141,17 @@ for opt in "$@"; do
     case "$opt" in
     --help|-h) show_help
     ;;
-    --target=*) target="${optval}"
+    --target=*)
+      target="${optval}"
+      platform_toolset=$(echo ${target} | awk 'BEGIN{FS="-"}{print $4}')
+      case "$platform_toolset" in
+        clangcl) platform_toolset="ClangCl"
+        ;;
+        "")
+        ;;
+        *) die Unrecognized Visual Studio Platform Toolset in $opt
+        ;;
+      esac
     ;;
     --out=*) outfile="$optval"
     ;;
@@ -259,6 +269,10 @@ case "$target" in
     ;;
     arm64*)
         platforms[0]="ARM64"
+        # As of Visual Studio 2022 17.5.5, clang-cl does not support ARM64EC.
+        if [ "$vs_ver" -ge 17 -a "$platform_toolset" != "ClangCl" ]; then
+            platforms[1]="ARM64EC"
+        fi
         asm_Debug_cmdline="armasm64 -nologo -oldit "%(FullPath)""
         asm_Release_cmdline="armasm64 -nologo -oldit "%(FullPath)""
     ;;
@@ -335,17 +349,21 @@ generate_vcxproj() {
   else
     tag_content ConfigurationType StaticLibrary
   fi
-  if [ "$vs_ver" = "14" ]; then
-    tag_content PlatformToolset v140
-  fi
-  if [ "$vs_ver" = "15" ]; then
-    tag_content PlatformToolset v141
-  fi
-  if [ "$vs_ver" = "16" ]; then
-    tag_content PlatformToolset v142
-  fi
-  if [ "$vs_ver" = "17" ]; then
-    tag_content PlatformToolset v143
+  if [ -n "$platform_toolset" ]; then
+    tag_content PlatformToolset "$platform_toolset"
+  else
+    if [ "$vs_ver" = "14" ]; then
+      tag_content PlatformToolset v140
+    fi
+    if [ "$vs_ver" = "15" ]; then
+      tag_content PlatformToolset v141
+    fi
+    if [ "$vs_ver" = "16" ]; then
+      tag_content PlatformToolset v142
+    fi
+    if [ "$vs_ver" = "17" ]; then
+      tag_content PlatformToolset v143
+    fi
   fi
   tag_content CharacterSet Unicode
   if [ "$config" = "Release" ]; then
diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl
index f4edeaad5..0b9e16738 100755
--- a/build/make/rtcd.pl
+++ b/build/make/rtcd.pl
@@ -487,7 +487,7 @@ if ($opts{arch} eq 'x86') {
   @ALL_ARCHS = filter(qw/neon_asm neon/);
   arm;
 } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
-  @ALL_ARCHS = filter(qw/neon/);
+  @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve/);
   @REQUIRES = filter(qw/neon/);
   &require(@REQUIRES);
   arm;
diff --git a/config/arm-neon/vp9_rtcd.h b/config/arm-neon/vp9_rtcd.h
index b2b2fc2dc..cb7c1948a 100644
--- a/config/arm-neon/vp9_rtcd.h
+++ b/config/arm-neon/vp9_rtcd.h
@@ -21,7 +21,9 @@ struct macroblockd;
 
 /* Encoder forward decls */
 struct macroblock;
-struct vp9_variance_vtable;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
 struct search_site_config;
 struct mv;
 union int_mv;
@@ -32,13 +34,15 @@ extern "C" {
 #endif
 
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_c
+int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_neon
 
 int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
-#define vp9_block_error_fp vp9_block_error_fp_c
+int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_neon
 
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int
sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); #define vp9_diamond_search_sad vp9_diamond_search_sad_neon void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -57,17 +61,20 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); -#define vp9_highbd_block_error vp9_highbd_block_error_c +int64_t vp9_highbd_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); +#define vp9_highbd_block_error vp9_highbd_block_error_neon void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c +void vp9_highbd_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_neon void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); #define vp9_highbd_fht4x4 vp9_highbd_fht4x4_neon void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c +void vp9_highbd_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_neon void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c @@ -84,12 +91,12 @@ void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int str void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_neon -void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, 
const struct ScanOrder *const scan_order); +void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_neon -void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_highbd_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_highbd_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_neon void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count); @@ -107,12 +114,12 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp vp9_quantize_fp_neon -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, 
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); diff --git a/config/arm-neon/vpx_config.asm b/config/arm-neon/vpx_config.asm index 2ccf56fde..6932c6eca 100644 --- a/config/arm-neon/vpx_config.asm +++ b/config/arm-neon/vpx_config.asm @@ -2,13 +2,17 @@ @ using the ads2gas.pl script. .syntax unified .equ VPX_ARCH_ARM , 1 +.equ VPX_ARCH_AARCH64 , 0 .equ VPX_ARCH_MIPS , 0 .equ VPX_ARCH_X86 , 0 .equ VPX_ARCH_X86_64 , 0 .equ VPX_ARCH_PPC , 0 .equ VPX_ARCH_LOONGARCH , 0 -.equ HAVE_NEON , 1 .equ HAVE_NEON_ASM , 1 +.equ HAVE_NEON , 1 +.equ HAVE_NEON_DOTPROD , 0 +.equ HAVE_NEON_I8MM , 0 +.equ HAVE_SVE , 0 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 @@ -78,7 +82,6 @@ .equ CONFIG_MULTI_RES_ENCODING , 0 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0 -.equ CONFIG_CONSISTENT_RECODE , 0 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 .equ CONFIG_VP9_HIGHBITDEPTH , 1 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0 @@ -91,4 +94,5 @@ .equ CONFIG_EMULATE_HARDWARE , 0 .equ CONFIG_NON_GREEDY_MV , 0 .equ CONFIG_RATE_CTRL , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 .section .note.GNU-stack,"",%progbits diff --git a/config/arm-neon/vpx_config.h b/config/arm-neon/vpx_config.h index 3fa6606ab..ae183d7d6 100644 --- a/config/arm-neon/vpx_config.h +++ b/config/arm-neon/vpx_config.h @@ -11,13 +11,17 @@ #define RESTRICT #define INLINE inline #define VPX_ARCH_ARM 1 +#define VPX_ARCH_AARCH64 0 #define VPX_ARCH_MIPS 0 #define VPX_ARCH_X86 0 #define VPX_ARCH_X86_64 0 #define VPX_ARCH_PPC 0 #define VPX_ARCH_LOONGARCH 0 -#define HAVE_NEON 1 #define HAVE_NEON_ASM 1 +#define HAVE_NEON 1 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -87,7 +91,6 @@ #define CONFIG_MULTI_RES_ENCODING 0 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -100,6 +103,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 4096 #define DECODE_HEIGHT_LIMIT 3072 #endif /* VPX_CONFIG_H */ diff --git a/config/arm-neon/vpx_dsp_rtcd.h b/config/arm-neon/vpx_dsp_rtcd.h index 565105892..578f1c5dc 100644 --- a/config/arm-neon/vpx_dsp_rtcd.h +++ b/config/arm-neon/vpx_dsp_rtcd.h @@ -15,6 +15,10 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -66,16 +70,20 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t 
src_stride, uint8_t *d #define vpx_convolve_copy vpx_convolve_copy_neon void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_neon void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_neon void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_neon void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_neon void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); @@ -94,28 +102,36 @@ void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t * #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_neon void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_neon void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_neon void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_neon void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_neon void vpx_d207_predictor_32x32_c(uint8_t *dst, 
ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_neon void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_neon void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_neon void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); @@ -137,16 +153,20 @@ void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *abo #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_neon void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_neon void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_neon void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_neon void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c @@ -838,10 +858,12 @@ unsigned int vpx_highbd_8_variance8x8_neon(const uint8_t *src_ptr, int src_strid #define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_neon unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p); -#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c +unsigned int vpx_highbd_avg_4x4_neon(const uint8_t *s8, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_neon unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p); -#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c +unsigned int vpx_highbd_avg_8x8_neon(const uint8_t *s8, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_neon void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, 
const uint16_t *ref, int ref_stride); void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride); @@ -880,16 +902,20 @@ void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, ui #define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_neon void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c +void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_neon void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c +void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_neon void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c +void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_neon void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c +void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_neon void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); @@ -908,28 +934,36 @@ void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const u #define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_neon void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c +void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_neon void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c +void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_neon void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c +void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define 
vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_neon void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c +void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_neon void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c +void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_neon void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c +void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_neon void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c +void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_neon void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c +void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_neon void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); @@ -948,16 +982,20 @@ void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const ui #define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_neon void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c +void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_neon void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c +void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_neon void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c +void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, 
ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_neon void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c +void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_neon void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); @@ -1072,13 +1110,16 @@ void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_neon void vpx_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); -#define vpx_highbd_hadamard_16x16 vpx_highbd_hadamard_16x16_c +void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_highbd_hadamard_16x16 vpx_highbd_hadamard_16x16_neon void vpx_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); -#define vpx_highbd_hadamard_32x32 vpx_highbd_hadamard_32x32_c +void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_highbd_hadamard_32x32 vpx_highbd_hadamard_32x32_neon void vpx_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); -#define vpx_highbd_hadamard_8x8 vpx_highbd_hadamard_8x8_c +void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_highbd_hadamard_8x8 vpx_highbd_hadamard_8x8_neon void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); @@ -1187,14 +1228,15 @@ void vpx_highbd_lpf_vertical_8_dual_neon(uint16_t *s, int pitch, const uint8_t * #define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_neon void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max); -#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c +void vpx_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_neon -void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder 
*const scan_order); +void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b vpx_highbd_quantize_b_neon -void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_highbd_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_neon unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1205,8 +1247,8 @@ unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_neon -void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_neon unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1217,8 +1259,8 @@ unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_neon -void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void 
vpx_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_neon unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1229,8 +1271,8 @@ unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_neon -void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_neon unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1241,8 +1283,8 @@ unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_neon -void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_neon unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1253,8 +1295,8 @@ unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_neon -void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_neon unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ 
-1265,8 +1307,8 @@ unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_neon -void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_neon unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1277,8 +1319,8 @@ unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_neon -void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_neon unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1289,8 +1331,8 @@ unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_neon -void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_neon unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1301,8 +1343,8 @@ unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_neon -void vpx_highbd_sad64x32x4d_c(const 
uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_neon unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1313,8 +1355,8 @@ unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_neon -void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_neon unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1325,8 +1367,8 @@ unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_neon -void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_neon unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1337,8 +1379,8 @@ unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_neon -void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const 
uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_neon unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1349,12 +1391,121 @@ unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_neon -void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_neon +unsigned int vpx_highbd_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x16 vpx_highbd_sad_skip_16x16_neon + +void vpx_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x16x4d vpx_highbd_sad_skip_16x16x4d_neon + +unsigned int vpx_highbd_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x32 vpx_highbd_sad_skip_16x32_neon + +void vpx_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x32x4d vpx_highbd_sad_skip_16x32x4d_neon + +unsigned int vpx_highbd_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x8 vpx_highbd_sad_skip_16x8_neon + +void vpx_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x8x4d vpx_highbd_sad_skip_16x8x4d_neon + +unsigned int vpx_highbd_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned 
int vpx_highbd_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x16 vpx_highbd_sad_skip_32x16_neon + +void vpx_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x16x4d vpx_highbd_sad_skip_32x16x4d_neon + +unsigned int vpx_highbd_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x32 vpx_highbd_sad_skip_32x32_neon + +void vpx_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x32x4d vpx_highbd_sad_skip_32x32x4d_neon + +unsigned int vpx_highbd_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x64 vpx_highbd_sad_skip_32x64_neon + +void vpx_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x64x4d vpx_highbd_sad_skip_32x64x4d_neon + +unsigned int vpx_highbd_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x4 vpx_highbd_sad_skip_4x4_neon + +void vpx_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x4x4d vpx_highbd_sad_skip_4x4x4d_neon + +unsigned int vpx_highbd_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x8 vpx_highbd_sad_skip_4x8_neon + +void vpx_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x8x4d vpx_highbd_sad_skip_4x8x4d_neon + +unsigned int vpx_highbd_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t 
*ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x32 vpx_highbd_sad_skip_64x32_neon + +void vpx_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x32x4d vpx_highbd_sad_skip_64x32x4d_neon + +unsigned int vpx_highbd_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x64 vpx_highbd_sad_skip_64x64_neon + +void vpx_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x64x4d vpx_highbd_sad_skip_64x64x4d_neon + +unsigned int vpx_highbd_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x16 vpx_highbd_sad_skip_8x16_neon + +void vpx_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x16x4d vpx_highbd_sad_skip_8x16x4d_neon + +unsigned int vpx_highbd_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x4 vpx_highbd_sad_skip_8x4_neon + +void vpx_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x4x4d vpx_highbd_sad_skip_8x4x4d_neon + +unsigned int vpx_highbd_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x8 vpx_highbd_sad_skip_8x8_neon + +void vpx_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x8x4d vpx_highbd_sad_skip_8x8x4d_neon + int vpx_highbd_satd_c(const tran_low_t *coeff, int length); -#define vpx_highbd_satd vpx_highbd_satd_c +int vpx_highbd_satd_neon(const tran_low_t *coeff, int length); +#define vpx_highbd_satd vpx_highbd_satd_neon + +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int 
height); +int64_t vpx_highbd_sse_neon(const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height); +#define vpx_highbd_sse vpx_highbd_sse_neon void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); @@ -1515,20 +1666,23 @@ unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uin #define vpx_mse16x16 vpx_mse16x16_neon unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_mse16x8 vpx_mse16x8_c +unsigned int vpx_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_neon unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_mse8x16 vpx_mse8x16_c +unsigned int vpx_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_neon unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_mse8x8 vpx_mse8x8_c +unsigned int vpx_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_neon -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b vpx_quantize_b_neon -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t 
*iscan); +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1539,8 +1693,8 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x16_avg vpx_sad16x16_avg_neon -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x16x4d vpx_sad16x16x4d_neon unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1551,8 +1705,8 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x32_avg vpx_sad16x32_avg_neon -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x32x4d vpx_sad16x32x4d_neon unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1563,8 +1717,8 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x8_avg vpx_sad16x8_avg_neon -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define 
vpx_sad16x8x4d vpx_sad16x8x4d_neon unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1575,8 +1729,8 @@ unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x16_avg vpx_sad32x16_avg_neon -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x16x4d vpx_sad32x16x4d_neon unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1587,8 +1741,8 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_neon -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x32x4d vpx_sad32x32x4d_neon unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1599,8 +1753,8 @@ unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x64_avg vpx_sad32x64_avg_neon -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x64x4d vpx_sad32x64x4d_neon unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1611,8 +1765,8 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x4_avg vpx_sad4x4_avg_neon -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const 
ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x4x4d vpx_sad4x4x4d_neon unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1623,8 +1777,8 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x8_avg vpx_sad4x8_avg_neon -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x8x4d vpx_sad4x8x4d_neon unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1635,8 +1789,8 @@ unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x32_avg vpx_sad64x32_avg_neon -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x32x4d vpx_sad64x32x4d_neon unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1647,8 +1801,8 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_neon -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x64x4d vpx_sad64x64x4d_neon unsigned int 
vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1659,8 +1813,8 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x16_avg vpx_sad8x16_avg_neon -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x16x4d vpx_sad8x16x4d_neon unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1671,8 +1825,8 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x4_avg vpx_sad8x4_avg_neon -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x4x4d vpx_sad8x4x4d_neon unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1683,10 +1837,114 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x8_avg vpx_sad8x8_avg_neon -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x8x4d vpx_sad8x8x4d_neon +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_neon + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t 
sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_neon + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_neon + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_neon + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_neon + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_neon + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_neon + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_neon + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_neon + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_neon + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_neon + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_neon + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 
vpx_sad_skip_4x4_neon + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_neon + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_neon + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_neon + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_neon + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_neon + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_neon + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_neon + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_neon + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_neon + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_neon + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_neon + +unsigned int 
vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_neon + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_neon + int vpx_satd_c(const tran_low_t *coeff, int length); int vpx_satd_neon(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_neon @@ -1710,6 +1968,10 @@ void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height); +int64_t vpx_sse_neon(const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height); +#define vpx_sse vpx_sse_neon + uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon diff --git a/config/arm-neon/vpx_version.h b/config/arm-neon/vpx_version.h index fa2cc50fd..00ab40fe2 100644 --- a/config/arm-neon/vpx_version.h +++ b/config/arm-neon/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. Do not edit. 
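The vpx_sad_skip_* entry points added above are the sub-sampled SAD kernels: instead of accumulating absolute differences over every row of a block, they visit alternate rows and scale the result, roughly halving the memory traffic during motion search. A minimal sketch of the reference behaviour, assuming the upstream convention of summing even rows only and doubling (the per-size vpx_sad_skip_WxH_c kernels are macro-generated in vpx_dsp/sad.c with width and height baked in; the generic parameters here are purely for illustration):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy skip-SAD: sum of absolute differences over even rows only, doubled
 * to approximate the full-block SAD. */
static unsigned int sad_skip(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             int width, int height) {
  unsigned int sad = 0;
  for (int r = 0; r < height; r += 2) {  /* visit every other row */
    for (int c = 0; c < width; ++c) sad += abs(src[c] - ref[c]);
    src += 2 * src_stride;               /* skip the odd row entirely */
    ref += 2 * ref_stride;
  }
  return 2 * sad;                        /* compensate for the skipped rows */
}

int main(void) {
  uint8_t src[16], ref[16];
  for (int i = 0; i < 16; ++i) { src[i] = 10; ref[i] = 12; }
  /* Uniform 4x4 blocks: the full SAD is 4 * 4 * 2 = 32, and the skipped
   * estimate 2 * (2 rows * 4 cols * 2) agrees exactly. */
  printf("skip SAD = %u\n", sad_skip(src, 4, ref, 4, 4, 4));
  return 0;
}

The neighbouring vpx_sse additions return the plain sum of squared errors over a width x height block as an int64_t, and because this build disables runtime CPU detection, each #define (for example, #define vpx_sad_skip_16x16 vpx_sad_skip_16x16_neon) binds the dispatch name straight to the NEON kernel at compile time. The generated vpx_version.h below records the matching library bump from v1.13.0 to v1.14.0; note that VERSION_PACKED encodes it as (1 << 16) | (14 << 8) | 0 = 0x010E00.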
#define VERSION_MAJOR 1 -#define VERSION_MINOR 13 +#define VERSION_MINOR 14 #define VERSION_PATCH 0 -#define VERSION_EXTRA "1559-gcd7dbca207" +#define VERSION_EXTRA "1616-g26104bbc9d" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.13.0-1559-gcd7dbca207" -#define VERSION_STRING " v1.13.0-1559-gcd7dbca207" +#define VERSION_STRING_NOSP "v1.14.0-1616-g26104bbc9d" +#define VERSION_STRING " v1.14.0-1616-g26104bbc9d" diff --git a/config/arm64/vp9_rtcd.h b/config/arm64/vp9_rtcd.h index b2b2fc2dc..cb7c1948a 100644 --- a/config/arm64/vp9_rtcd.h +++ b/config/arm64/vp9_rtcd.h @@ -21,7 +21,9 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -32,13 +34,15 @@ extern "C" { #endif int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -#define vp9_block_error vp9_block_error_c +int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +#define vp9_block_error vp9_block_error_neon int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); -#define vp9_block_error_fp vp9_block_error_fp_c +int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +#define vp9_block_error_fp vp9_block_error_fp_neon -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); #define vp9_diamond_search_sad vp9_diamond_search_sad_neon void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -57,17 +61,20 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); -#define vp9_highbd_block_error vp9_highbd_block_error_c +int64_t vp9_highbd_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); +#define vp9_highbd_block_error vp9_highbd_block_error_neon void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c +void vp9_highbd_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_neon void 
vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); #define vp9_highbd_fht4x4 vp9_highbd_fht4x4_neon void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c +void vp9_highbd_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_neon void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c @@ -84,12 +91,12 @@ void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int str void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_neon -void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_neon -void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_highbd_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_highbd_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_neon void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count); @@ -107,12 +114,12 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, 
uint8_t *dest, int stride, int void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp vp9_quantize_fp_neon -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); diff --git a/config/arm64/vpx_config.asm b/config/arm64/vpx_config.asm index c4b840b89..f23e27fe7 100644 --- a/config/arm64/vpx_config.asm +++ b/config/arm64/vpx_config.asm @@ -2,13 +2,17 @@ @ using the ads2gas.pl script. 
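The quantizer hunks above capture the other significant API refactor in this update: vp9_quantize_fp, vp9_highbd_quantize_fp, and their 32x32 variants (like vpx_quantize_b earlier) now take a const struct macroblock_plane * and a const struct ScanOrder * in place of six loose pointer arguments (zbin/round/quant/quant_shift plus scan/iscan), so per-plane quantizer state travels as one object. It lands alongside the motion-search change higher in the same header, where vp9_diamond_search_sad now takes a struct vp9_sad_table and an explicit start_mv_sad instead of the old vp9_variance_vtable. Below is a compilable toy in the new calling shape, with deliberately trimmed struct layouts; the real definitions, in vp9/encoder/vp9_block.h and the scan-table headers, carry considerably more state, and the real kernels additionally handle zbin skipping and 32x32 rounding:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef int32_t tran_low_t; /* matches the high-bit-depth build's typedef */

/* Hypothetical, trimmed views of the structs named in the new prototypes. */
struct macroblock_plane { const int16_t *round; const int16_t *quant; };
struct ScanOrder { const int16_t *scan; };

/* Simplified fp-style quantizer: round, multiply in Q16 fixed point,
 * dequantize, and track the end-of-block (last nonzero in scan order). */
static void quantize_fp_sketch(const tran_low_t *coeff, intptr_t n_coeffs,
                               const struct macroblock_plane *p,
                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
                               const int16_t *dequant, uint16_t *eob,
                               const struct ScanOrder *so) {
  *eob = 0;
  for (intptr_t i = 0; i < n_coeffs; ++i) {
    const int rc = so->scan[i];        /* visit coefficients in scan order */
    const int k = rc != 0;             /* slot 0 is DC, slot 1 is AC */
    const int q = (abs((int)coeff[rc]) + p->round[k]) * p->quant[k] >> 16;
    qcoeff[rc] = coeff[rc] < 0 ? -q : q;
    dqcoeff[rc] = qcoeff[rc] * dequant[k];
    if (q) *eob = (uint16_t)(i + 1);
  }
}

int main(void) {
  const tran_low_t coeff[4] = { 100, -25, 3, 0 };
  const int16_t round[2] = { 8, 8 }, quant[2] = { 1 << 14, 1 << 14 };
  const int16_t dequant[2] = { 4, 4 }, scan[4] = { 0, 1, 2, 3 };
  const struct macroblock_plane p = { round, quant };
  const struct ScanOrder so = { scan };
  tran_low_t qcoeff[4], dqcoeff[4];
  uint16_t eob;
  quantize_fp_sketch(coeff, 4, &p, qcoeff, dqcoeff, dequant, &eob, &so);
  printf("eob = %u, dc -> %d\n", eob, (int)qcoeff[0]); /* eob = 3, dc -> 27 */
  return 0;
}

The vpx_config.asm hunk that continues below shows the build-system side of the same update: a new VPX_ARCH_AARCH64 flag, plus HAVE_NEON_DOTPROD, HAVE_NEON_I8MM, and HAVE_SVE gates for the newer Arm SIMD extensions, all disabled in this particular configuration.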
.syntax unified .equ VPX_ARCH_ARM , 1 +.equ VPX_ARCH_AARCH64 , 1 .equ VPX_ARCH_MIPS , 0 .equ VPX_ARCH_X86 , 0 .equ VPX_ARCH_X86_64 , 0 .equ VPX_ARCH_PPC , 0 .equ VPX_ARCH_LOONGARCH , 0 -.equ HAVE_NEON , 1 .equ HAVE_NEON_ASM , 0 +.equ HAVE_NEON , 1 +.equ HAVE_NEON_DOTPROD , 0 +.equ HAVE_NEON_I8MM , 0 +.equ HAVE_SVE , 0 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 @@ -78,7 +82,6 @@ .equ CONFIG_MULTI_RES_ENCODING , 0 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0 -.equ CONFIG_CONSISTENT_RECODE , 0 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 .equ CONFIG_VP9_HIGHBITDEPTH , 1 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0 @@ -91,4 +94,5 @@ .equ CONFIG_EMULATE_HARDWARE , 0 .equ CONFIG_NON_GREEDY_MV , 0 .equ CONFIG_RATE_CTRL , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 .section .note.GNU-stack,"",%progbits diff --git a/config/arm64/vpx_config.c b/config/arm64/vpx_config.c index 13490c81c..d9a44071c 100644 --- a/config/arm64/vpx_config.c +++ b/config/arm64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--force-target=armv8-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth"; +static const char* const cfg = "--target=armv8-linux-gcc --disable-neon_dotprod --disable-neon_i8mm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/config/arm64/vpx_config.h b/config/arm64/vpx_config.h index 247c0ea6f..03f681712 100644 --- a/config/arm64/vpx_config.h +++ b/config/arm64/vpx_config.h @@ -11,13 +11,17 @@ #define RESTRICT #define INLINE inline #define VPX_ARCH_ARM 1 +#define VPX_ARCH_AARCH64 1 #define VPX_ARCH_MIPS 0 #define VPX_ARCH_X86 0 #define VPX_ARCH_X86_64 0 #define VPX_ARCH_PPC 0 #define VPX_ARCH_LOONGARCH 0 -#define HAVE_NEON 1 #define HAVE_NEON_ASM 0 +#define HAVE_NEON 1 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -87,7 +91,6 @@ #define CONFIG_MULTI_RES_ENCODING 0 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -100,6 +103,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 4096 #define DECODE_HEIGHT_LIMIT 3072 #endif /* VPX_CONFIG_H */ diff --git a/config/arm64/vpx_dsp_rtcd.h b/config/arm64/vpx_dsp_rtcd.h index 565105892..578f1c5dc 100644 --- a/config/arm64/vpx_dsp_rtcd.h +++ b/config/arm64/vpx_dsp_rtcd.h @@ -15,6 +15,10 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -66,16 +70,20 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *d #define vpx_convolve_copy vpx_convolve_copy_neon void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_16x16 
vpx_d117_predictor_16x16_c +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_neon void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_neon void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_neon void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_neon void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); @@ -94,28 +102,36 @@ void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t * #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_neon void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_neon void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_neon void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_neon void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_neon void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, 
const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_neon void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_neon void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_neon void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); @@ -137,16 +153,20 @@ void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *abo #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_neon void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_neon void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_neon void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_neon void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c @@ -838,10 +858,12 @@ unsigned int vpx_highbd_8_variance8x8_neon(const uint8_t *src_ptr, int src_strid #define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_neon unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p); -#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c +unsigned int vpx_highbd_avg_4x4_neon(const uint8_t *s8, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_neon unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p); -#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c +unsigned int vpx_highbd_avg_8x8_neon(const uint8_t *s8, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_neon void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride); void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride); @@ -880,16 +902,20 @@ void 
vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, ui #define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_neon void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c +void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_neon void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c +void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_neon void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c +void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_neon void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c +void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_neon void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); @@ -908,28 +934,36 @@ void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const u #define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_neon void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c +void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_neon void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c +void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_neon void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c +void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_neon void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define 
vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c +void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_neon void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c +void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_neon void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c +void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_neon void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c +void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_neon void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c +void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_neon void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); @@ -948,16 +982,20 @@ void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const ui #define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_neon void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c +void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_neon void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c +void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_neon void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c +void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_neon void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, 
ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c +void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_neon void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); @@ -1072,13 +1110,16 @@ void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_neon void vpx_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); -#define vpx_highbd_hadamard_16x16 vpx_highbd_hadamard_16x16_c +void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_highbd_hadamard_16x16 vpx_highbd_hadamard_16x16_neon void vpx_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); -#define vpx_highbd_hadamard_32x32 vpx_highbd_hadamard_32x32_c +void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_highbd_hadamard_32x32 vpx_highbd_hadamard_32x32_neon void vpx_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); -#define vpx_highbd_hadamard_8x8 vpx_highbd_hadamard_8x8_c +void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_highbd_hadamard_8x8 vpx_highbd_hadamard_8x8_neon void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); @@ -1187,14 +1228,15 @@ void vpx_highbd_lpf_vertical_8_dual_neon(uint16_t *s, int pitch, const uint8_t * #define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_neon void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max); -#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c +void vpx_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_neon -void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t 
*dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b vpx_highbd_quantize_b_neon -void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_highbd_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_neon unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1205,8 +1247,8 @@ unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_neon -void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_neon unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1217,8 +1259,8 @@ unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_neon -void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x32x4d 
vpx_highbd_sad16x32x4d_neon unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1229,8 +1271,8 @@ unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_neon -void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_neon unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1241,8 +1283,8 @@ unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_neon -void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_neon unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1253,8 +1295,8 @@ unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_neon -void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_neon unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1265,8 +1307,8 @@ unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x64_avg_neon(const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_neon -void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_neon unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1277,8 +1319,8 @@ unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_neon -void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_neon unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1289,8 +1331,8 @@ unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_neon -void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_neon unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1301,8 +1343,8 @@ unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_neon -void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int 
src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_neon unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1313,8 +1355,8 @@ unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_neon -void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_neon unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1325,8 +1367,8 @@ unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_neon -void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_neon unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1337,8 +1379,8 @@ unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_neon -void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const 
ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_neon unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1349,12 +1391,121 @@ unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_neon -void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_neon +unsigned int vpx_highbd_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x16 vpx_highbd_sad_skip_16x16_neon + +void vpx_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x16x4d vpx_highbd_sad_skip_16x16x4d_neon + +unsigned int vpx_highbd_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x32 vpx_highbd_sad_skip_16x32_neon + +void vpx_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x32x4d vpx_highbd_sad_skip_16x32x4d_neon + +unsigned int vpx_highbd_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x8 vpx_highbd_sad_skip_16x8_neon + +void vpx_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x8x4d vpx_highbd_sad_skip_16x8x4d_neon + +unsigned int vpx_highbd_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x16 
vpx_highbd_sad_skip_32x16_neon + +void vpx_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x16x4d vpx_highbd_sad_skip_32x16x4d_neon + +unsigned int vpx_highbd_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x32 vpx_highbd_sad_skip_32x32_neon + +void vpx_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x32x4d vpx_highbd_sad_skip_32x32x4d_neon + +unsigned int vpx_highbd_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x64 vpx_highbd_sad_skip_32x64_neon + +void vpx_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x64x4d vpx_highbd_sad_skip_32x64x4d_neon + +unsigned int vpx_highbd_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x4 vpx_highbd_sad_skip_4x4_neon + +void vpx_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x4x4d vpx_highbd_sad_skip_4x4x4d_neon + +unsigned int vpx_highbd_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x8 vpx_highbd_sad_skip_4x8_neon + +void vpx_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x8x4d vpx_highbd_sad_skip_4x8x4d_neon + +unsigned int vpx_highbd_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x32 vpx_highbd_sad_skip_64x32_neon + +void vpx_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, 
int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x32x4d vpx_highbd_sad_skip_64x32x4d_neon + +unsigned int vpx_highbd_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x64 vpx_highbd_sad_skip_64x64_neon + +void vpx_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x64x4d vpx_highbd_sad_skip_64x64x4d_neon + +unsigned int vpx_highbd_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x16 vpx_highbd_sad_skip_8x16_neon + +void vpx_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x16x4d vpx_highbd_sad_skip_8x16x4d_neon + +unsigned int vpx_highbd_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x4 vpx_highbd_sad_skip_8x4_neon + +void vpx_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x4x4d vpx_highbd_sad_skip_8x4x4d_neon + +unsigned int vpx_highbd_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x8 vpx_highbd_sad_skip_8x8_neon + +void vpx_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x8x4d vpx_highbd_sad_skip_8x8x4d_neon + int vpx_highbd_satd_c(const tran_low_t *coeff, int length); -#define vpx_highbd_satd vpx_highbd_satd_c +int vpx_highbd_satd_neon(const tran_low_t *coeff, int length); +#define vpx_highbd_satd vpx_highbd_satd_neon + +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height); +int64_t vpx_highbd_sse_neon(const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height); +#define vpx_highbd_sse 
vpx_highbd_sse_neon void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); @@ -1515,20 +1666,23 @@ unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uin #define vpx_mse16x16 vpx_mse16x16_neon unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_mse16x8 vpx_mse16x8_c +unsigned int vpx_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_neon unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_mse8x16 vpx_mse8x16_c +unsigned int vpx_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_neon unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_mse8x8 vpx_mse8x8_c +unsigned int vpx_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_neon -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b vpx_quantize_b_neon -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t 
*dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1539,8 +1693,8 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x16_avg vpx_sad16x16_avg_neon -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x16x4d vpx_sad16x16x4d_neon unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1551,8 +1705,8 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x32_avg vpx_sad16x32_avg_neon -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x32x4d vpx_sad16x32x4d_neon unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1563,8 +1717,8 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x8_avg vpx_sad16x8_avg_neon -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x8x4d vpx_sad16x8x4d_neon unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1575,8 
+1729,8 @@ unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x16_avg vpx_sad32x16_avg_neon -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x16x4d vpx_sad32x16x4d_neon unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1587,8 +1741,8 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_neon -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x32x4d vpx_sad32x32x4d_neon unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1599,8 +1753,8 @@ unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x64_avg vpx_sad32x64_avg_neon -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x64x4d vpx_sad32x64x4d_neon unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1611,8 +1765,8 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x4_avg vpx_sad4x4_avg_neon -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int 
ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x4x4d vpx_sad4x4x4d_neon unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1623,8 +1777,8 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x8_avg vpx_sad4x8_avg_neon -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x8x4d vpx_sad4x8x4d_neon unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1635,8 +1789,8 @@ unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x32_avg vpx_sad64x32_avg_neon -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x32x4d vpx_sad64x32x4d_neon unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1647,8 +1801,8 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_neon -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x64x4d vpx_sad64x64x4d_neon unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1659,8 +1813,8 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, 
int src_stride, const uin unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x16_avg vpx_sad8x16_avg_neon -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x16x4d vpx_sad8x16x4d_neon unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1671,8 +1825,8 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x4_avg vpx_sad8x4_avg_neon -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x4x4d vpx_sad8x4x4d_neon unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1683,10 +1837,114 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x8_avg vpx_sad8x8_avg_neon -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x8x4d vpx_sad8x8x4d_neon +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_neon + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_neon + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride); +unsigned int vpx_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_neon + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_neon + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_neon + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_neon + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_neon + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_neon + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_neon + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_neon + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_neon + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_neon + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_neon + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void 
vpx_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_neon + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_neon + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_neon + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_neon + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_neon + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_neon + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_neon + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_neon + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_neon + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_neon + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_neon + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_neon(const uint8_t *src_ptr, int 
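/*
 * Note the dispatch pattern running through this header: every generic
 * vpx_* name is a macro pinned at configure time to the best variant this
 * target enables, so call sites never name an ISA. An illustrative caller
 * (buffer setup omitted; the helper name is hypothetical):
 */
#include <stdint.h>

static void prune_candidates_sketch(const uint8_t *src, int src_stride,
                                    const uint8_t *const refs[4],
                                    int ref_stride) {
  uint32_t sads[4];
  /* Generic name; on this arm64 config the macro binds it to the Neon
   * kernel, i.e. vpx_sad_skip_8x8x4d_neon(...). */
  vpx_sad_skip_8x8x4d(src, src_stride, refs, ref_stride, sads);
  (void)sads;
}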
src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_neon + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_neon + int vpx_satd_c(const tran_low_t *coeff, int length); int vpx_satd_neon(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_neon @@ -1710,6 +1968,10 @@ void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height); +int64_t vpx_sse_neon(const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height); +#define vpx_sse vpx_sse_neon + uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon diff --git a/config/arm64/vpx_version.h b/config/arm64/vpx_version.h index fa2cc50fd..00ab40fe2 100644 --- a/config/arm64/vpx_version.h +++ b/config/arm64/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. Do not edit. 
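/*
 * VERSION_PACKED below folds major/minor/patch into one comparable
 * integer: (major << 16) | (minor << 8) | patch. For the bump in this
 * diff, v1.13.0 -> v1.14.0:
 *   old: (1 << 16) | (13 << 8) | 0 = 65536 + 3328 + 0 = 68864 = 0x010D00
 *   new: (1 << 16) | (14 << 8) | 0 = 65536 + 3584 + 0 = 69120 = 0x010E00
 */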
#define VERSION_MAJOR 1 -#define VERSION_MINOR 13 +#define VERSION_MINOR 14 #define VERSION_PATCH 0 -#define VERSION_EXTRA "1559-gcd7dbca207" +#define VERSION_EXTRA "1616-g26104bbc9d" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.13.0-1559-gcd7dbca207" -#define VERSION_STRING " v1.13.0-1559-gcd7dbca207" +#define VERSION_STRING_NOSP "v1.14.0-1616-g26104bbc9d" +#define VERSION_STRING " v1.14.0-1616-g26104bbc9d" diff --git a/config/generic/vp9_rtcd.h b/config/generic/vp9_rtcd.h index 07d24536d..d3379b6dd 100644 --- a/config/generic/vp9_rtcd.h +++ b/config/generic/vp9_rtcd.h @@ -21,7 +21,9 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -37,7 +39,7 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, in int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); #define vp9_block_error_fp vp9_block_error_fp_c -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); #define vp9_diamond_search_sad vp9_diamond_search_sad_c void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -76,10 +78,10 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int str void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c -void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c -void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count); @@ -94,10 +96,10 @@ void 
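/*
 * Two encoder-side signature changes recur through these rtcd headers: the
 * diamond search now receives the starting vector's SAD (start_mv_sad),
 * apparently precomputed by the caller, and its kernel table is a
 * vp9_sad_table rather than the old vp9_variance_vtable. The struct itself
 * lives in the encoder sources; a hypothetical sketch of the shape it
 * plausibly takes (field and type names here are assumptions, not the
 * library's):
 */
#include <stdint.h>

typedef unsigned int (*sad_fn_sketch)(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride);
typedef void (*sad_x4d_fn_sketch)(const uint8_t *src, int src_stride,
                                  const uint8_t *const refs[4],
                                  int ref_stride, uint32_t sads[4]);

struct sad_table_sketch {
  sad_fn_sketch sdf;        /* single-candidate SAD */
  sad_x4d_fn_sketch sdx4df; /* 4-candidate SAD, the x4d kernels above */
};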
vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_c -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp vp9_quantize_fp_c -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); diff --git a/config/generic/vpx_config.asm b/config/generic/vpx_config.asm index ffeb85ebb..f14048b69 100644 --- a/config/generic/vpx_config.asm +++ b/config/generic/vpx_config.asm @@ -2,13 +2,17 @@ @ using the ads2gas.pl script. .syntax unified .equ VPX_ARCH_ARM , 0 +.equ VPX_ARCH_AARCH64 , 0 .equ VPX_ARCH_MIPS , 0 .equ VPX_ARCH_X86 , 0 .equ VPX_ARCH_X86_64 , 0 .equ VPX_ARCH_PPC , 0 .equ VPX_ARCH_LOONGARCH , 0 -.equ HAVE_NEON , 0 .equ HAVE_NEON_ASM , 0 +.equ HAVE_NEON , 0 +.equ HAVE_NEON_DOTPROD , 0 +.equ HAVE_NEON_I8MM , 0 +.equ HAVE_SVE , 0 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 @@ -78,7 +82,6 @@ .equ CONFIG_MULTI_RES_ENCODING , 0 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0 -.equ CONFIG_CONSISTENT_RECODE , 0 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 .equ CONFIG_VP9_HIGHBITDEPTH , 1 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0 @@ -91,4 +94,5 @@ .equ CONFIG_EMULATE_HARDWARE , 0 .equ CONFIG_NON_GREEDY_MV , 0 .equ CONFIG_RATE_CTRL , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 .section .note.GNU-stack,"",%progbits diff --git a/config/generic/vpx_config.h b/config/generic/vpx_config.h index c9d8393c8..bade04289 100644 --- a/config/generic/vpx_config.h +++ b/config/generic/vpx_config.h @@ -11,13 +11,17 @@ #define RESTRICT #define INLINE inline #define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 #define VPX_ARCH_MIPS 0 #define VPX_ARCH_X86 0 #define VPX_ARCH_X86_64 0 #define VPX_ARCH_PPC 0 #define VPX_ARCH_LOONGARCH 0 -#define HAVE_NEON 0 #define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -87,7 +91,6 @@ #define CONFIG_MULTI_RES_ENCODING 0 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -100,6 +103,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define 
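/*
 * The config block gains finer-grained Arm feature switches: AArch64 is
 * now its own arch flag, and Neon DotProd, Neon I8MM and SVE each get a
 * HAVE_* gate (all 0 for this generic target). Specialized declarations
 * are fenced the same way existing HAVE_NEON code is; the pattern, with
 * the declaration shown purely for illustration:
 */
#include <stdint.h>

#if HAVE_NEON_DOTPROD
/* Only declared/compiled when the configure step enabled the extension. */
unsigned int vpx_sad16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride);
#endif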
CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 4096 #define DECODE_HEIGHT_LIMIT 3072 #endif /* VPX_CONFIG_H */ diff --git a/config/generic/vpx_dsp_rtcd.h b/config/generic/vpx_dsp_rtcd.h index 328601f76..256cbdfa5 100644 --- a/config/generic/vpx_dsp_rtcd.h +++ b/config/generic/vpx_dsp_rtcd.h @@ -15,6 +15,10 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -930,10 +934,10 @@ void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *bli void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max); #define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c -void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b vpx_highbd_quantize_b_c -void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -942,7 +946,7 @@ unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c -void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -951,7 +955,7 @@ unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c -void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const 
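/*
 * The quantizer refactor visible above replaces long argument lists with
 * two structs: the per-plane tables (formerly zbin_ptr, round_ptr,
 * quant_ptr, quant_shift_ptr) now travel inside macroblock_plane, and the
 * scan/iscan pair inside ScanOrder; the forward declarations under
 * CONFIG_VP9_ENCODER keep this header free of encoder includes. A
 * hypothetical mirror of the bundled state (field names are assumptions
 * for illustration):
 */
#include <stdint.h>

struct plane_quant_sketch {
  const int16_t *zbin;        /* dead-zone thresholds */
  const int16_t *round;       /* rounding offsets */
  const int16_t *quant;       /* quantizer multipliers */
  const int16_t *quant_shift; /* post-multiply shifts */
};

struct scan_order_sketch {
  const int16_t *scan;  /* coefficient visit order */
  const int16_t *iscan; /* inverse mapping, position -> scan index */
};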
uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -960,7 +964,7 @@ unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c -void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -969,7 +973,7 @@ unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c -void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -978,7 +982,7 @@ unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c -void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -987,7 +991,7 @@ unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c -void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -996,7 +1000,7 @@ unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t 
*second_pred); #define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c -void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1005,7 +1009,7 @@ unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c -void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1014,7 +1018,7 @@ unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c -void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1023,7 +1027,7 @@ unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c -void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1032,7 +1036,7 @@ unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c -void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const 
uint8_t *ref_ptr, int ref_stride); @@ -1041,7 +1045,7 @@ unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c -void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1050,12 +1054,93 @@ unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c -void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c +unsigned int vpx_highbd_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x16 vpx_highbd_sad_skip_16x16_c + +void vpx_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x16x4d vpx_highbd_sad_skip_16x16x4d_c + +unsigned int vpx_highbd_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x32 vpx_highbd_sad_skip_16x32_c + +void vpx_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x32x4d vpx_highbd_sad_skip_16x32x4d_c + +unsigned int vpx_highbd_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x8 vpx_highbd_sad_skip_16x8_c + +void vpx_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x8x4d vpx_highbd_sad_skip_16x8x4d_c + +unsigned int vpx_highbd_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x16 vpx_highbd_sad_skip_32x16_c + +void vpx_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x16x4d vpx_highbd_sad_skip_32x16x4d_c + +unsigned int vpx_highbd_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x32 vpx_highbd_sad_skip_32x32_c + +void vpx_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x32x4d vpx_highbd_sad_skip_32x32x4d_c + +unsigned int vpx_highbd_sad_skip_32x64_c(const uint8_t *src_ptr, 
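/*
 * The highbd kernels keep uint8_t pointers in their signatures even though
 * the buffers hold 16-bit samples; implementations convert the pointer
 * back (the library has a dedicated macro for this in vpx_dsp_common.h).
 * A simplified sketch of the convention, with a plain cast standing in for
 * that macro:
 */
#include <stdint.h>
#include <stdlib.h>

static unsigned int highbd_sad4x4_sketch(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride) {
  const uint16_t *src = (const uint16_t *)src8;
  const uint16_t *ref = (const uint16_t *)ref8;
  unsigned int sad = 0;
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c)
      sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
  return sad;
}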
int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x64 vpx_highbd_sad_skip_32x64_c + +void vpx_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x64x4d vpx_highbd_sad_skip_32x64x4d_c + +unsigned int vpx_highbd_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x4 vpx_highbd_sad_skip_4x4_c + +void vpx_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x4x4d vpx_highbd_sad_skip_4x4x4d_c + +unsigned int vpx_highbd_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x8 vpx_highbd_sad_skip_4x8_c + +void vpx_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x8x4d vpx_highbd_sad_skip_4x8x4d_c + +unsigned int vpx_highbd_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x32 vpx_highbd_sad_skip_64x32_c + +void vpx_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x32x4d vpx_highbd_sad_skip_64x32x4d_c + +unsigned int vpx_highbd_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x64 vpx_highbd_sad_skip_64x64_c + +void vpx_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x64x4d vpx_highbd_sad_skip_64x64x4d_c + +unsigned int vpx_highbd_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x16 vpx_highbd_sad_skip_8x16_c + +void vpx_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x16x4d vpx_highbd_sad_skip_8x16x4d_c + +unsigned int vpx_highbd_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x4 vpx_highbd_sad_skip_8x4_c + +void vpx_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x4x4d vpx_highbd_sad_skip_8x4x4d_c + +unsigned int vpx_highbd_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x8 vpx_highbd_sad_skip_8x8_c + +void vpx_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x8x4d vpx_highbd_sad_skip_8x8x4d_c + int vpx_highbd_satd_c(const tran_low_t *coeff, int length); #define vpx_highbd_satd vpx_highbd_satd_c +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height); +#define vpx_highbd_sse vpx_highbd_sse_c + void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, 
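/*
 * vpx_sse / vpx_highbd_sse (declared above) return the raw sum of squared
 * differences over an arbitrary width x height window, accumulated in 64
 * bits so large planes cannot overflow a 32-bit counter. A plain C sketch
 * of the 8-bit contract:
 */
#include <stdint.h>

static int64_t sse_sketch(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int width, int height) {
  int64_t sse = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      const int d = a[c] - b[c];
      sse += (int64_t)d * d; /* widen before accumulating */
    }
    a += a_stride;
    b += b_stride;
  }
  return sse;
}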
ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); #define vpx_highbd_subtract_block vpx_highbd_subtract_block_c @@ -1185,10 +1270,10 @@ unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_mse8x8 vpx_mse8x8_c -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b vpx_quantize_b_c -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1197,7 +1282,7 @@ unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x16_avg vpx_sad16x16_avg_c -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x16x4d vpx_sad16x16x4d_c unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1206,7 +1291,7 @@ unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x32_avg vpx_sad16x32_avg_c -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x32x4d vpx_sad16x32x4d_c unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1215,7 +1300,7 @@ unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x8_avg vpx_sad16x8_avg_c -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * 
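/*
 * One asymmetry worth noting above: vpx_quantize_b keeps its intptr_t
 * n_coeffs parameter, while the 32x32 variant drops it, presumably because
 * that block size fixes the coefficient count at 32 * 32 = 1024.
 */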
const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x8x4d vpx_sad16x8x4d_c unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1224,7 +1309,7 @@ unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x16_avg vpx_sad32x16_avg_c -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x16x4d vpx_sad32x16x4d_c unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1233,7 +1318,7 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_c -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x32x4d vpx_sad32x32x4d_c unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1242,7 +1327,7 @@ unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x64_avg vpx_sad32x64_avg_c -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x64x4d vpx_sad32x64x4d_c unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1251,7 +1336,7 @@ unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x4_avg vpx_sad4x4_avg_c -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x4x4d vpx_sad4x4x4d_c unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1260,7 +1345,7 @@ unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x8_avg vpx_sad4x8_avg_c -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const 
uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x8x4d vpx_sad4x8x4d_c unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1269,7 +1354,7 @@ unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x32_avg vpx_sad64x32_avg_c -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x32x4d vpx_sad64x32x4d_c unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1278,7 +1363,7 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_c -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x64x4d vpx_sad64x64x4d_c unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1287,7 +1372,7 @@ unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x16_avg vpx_sad8x16_avg_c -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x16x4d vpx_sad8x16x4d_c unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1296,7 +1381,7 @@ unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x4_avg vpx_sad8x4_avg_c -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x4x4d vpx_sad8x4x4d_c unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1305,9 +1390,87 @@ unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x8_avg vpx_sad8x8_avg_c -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const 
uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x8x4d vpx_sad8x8x4d_c +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_c + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_c + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_c + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_c + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_c + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_c + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_c + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_c + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_c + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_c + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_c + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_c + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_c + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_c + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_c + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_c + +unsigned int 
vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_c + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_c + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_c + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_c + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_c + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_c + int vpx_satd_c(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_c @@ -1329,6 +1492,9 @@ void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height); +#define vpx_sse vpx_sse_c + uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c diff --git a/config/generic/vpx_version.h b/config/generic/vpx_version.h index fa2cc50fd..00ab40fe2 100644 --- a/config/generic/vpx_version.h +++ b/config/generic/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. Do not edit. 
#define VERSION_MAJOR 1 -#define VERSION_MINOR 13 +#define VERSION_MINOR 14 #define VERSION_PATCH 0 -#define VERSION_EXTRA "1559-gcd7dbca207" +#define VERSION_EXTRA "1616-g26104bbc9d" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.13.0-1559-gcd7dbca207" -#define VERSION_STRING " v1.13.0-1559-gcd7dbca207" +#define VERSION_STRING_NOSP "v1.14.0-1616-g26104bbc9d" +#define VERSION_STRING " v1.14.0-1616-g26104bbc9d" diff --git a/config/x86/vp8_rtcd.h b/config/x86/vp8_rtcd.h index 5f7b32673..99ea7c278 100644 --- a/config/x86/vp8_rtcd.h +++ b/config/x86/vp8_rtcd.h @@ -45,15 +45,6 @@ void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_lin void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); #define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_ssse3 -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); -#define vp8_blend_b vp8_blend_b_c - -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); -#define vp8_blend_mb_inner vp8_blend_mb_inner_c - -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); -#define vp8_blend_mb_outer vp8_blend_mb_outer_c - int vp8_block_error_c(short *coeff, short *dqcoeff); int vp8_block_error_sse2(short *coeff, short *dqcoeff); #define vp8_block_error vp8_block_error_sse2 diff --git a/config/x86/vp9_rtcd.h b/config/x86/vp9_rtcd.h index 580d55a28..bd3233e20 100644 --- a/config/x86/vp9_rtcd.h +++ b/config/x86/vp9_rtcd.h @@ -21,7 +21,9 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -39,7 +41,7 @@ int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); #define vp9_block_error_fp vp9_block_error_fp_sse2 -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); #define vp9_diamond_search_sad vp9_diamond_search_sad_c void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -83,10 +85,10 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int str void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c -void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const 
struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c -void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count); @@ -104,13 +106,13 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp vp9_quantize_fp_ssse3 -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_32x32_c(const 
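/*
 * On x86 several SIMD levels of one kernel coexist (_c, _sse2, _ssse3
 * here), and the generated header pins the macro to the strongest level
 * enabled at configure time, so this build resolves vp9_quantize_fp(...)
 * to vp9_quantize_fp_ssse3(...). Builds configured for runtime dispatch
 * would instead select among these via function pointers during rtcd
 * initialization.
 */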
tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_ssse3 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); diff --git a/config/x86/vpx_config.asm b/config/x86/vpx_config.asm index 8c108efe9..360909d1e 100644 --- a/config/x86/vpx_config.asm +++ b/config/x86/vpx_config.asm @@ -1,11 +1,15 @@ %define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 %define VPX_ARCH_MIPS 0 %define VPX_ARCH_X86 1 %define VPX_ARCH_X86_64 0 %define VPX_ARCH_PPC 0 %define VPX_ARCH_LOONGARCH 0 -%define HAVE_NEON 0 %define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 @@ -75,7 +79,6 @@ %define CONFIG_MULTI_RES_ENCODING 0 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_VP9_TEMPORAL_DENOISING 0 -%define CONFIG_CONSISTENT_RECODE 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_VP9_HIGHBITDEPTH 1 %define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -88,3 +91,4 @@ %define CONFIG_EMULATE_HARDWARE 0 %define CONFIG_NON_GREEDY_MV 0 %define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/config/x86/vpx_config.h b/config/x86/vpx_config.h index 6cc7eda34..a7f41a9ae 100644 --- a/config/x86/vpx_config.h +++ b/config/x86/vpx_config.h @@ -11,13 +11,17 @@ #define RESTRICT #define INLINE inline #define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 #define VPX_ARCH_MIPS 0 #define VPX_ARCH_X86 1 #define VPX_ARCH_X86_64 0 #define VPX_ARCH_PPC 0 #define VPX_ARCH_LOONGARCH 0 -#define HAVE_NEON 0 #define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -87,7 +91,6 @@ #define CONFIG_MULTI_RES_ENCODING 0 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -100,6 +103,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 4096 #define DECODE_HEIGHT_LIMIT 3072 #endif /* VPX_CONFIG_H */ diff --git a/config/x86/vpx_dsp_rtcd.h b/config/x86/vpx_dsp_rtcd.h index 91242deee..67c150490 100644 --- a/config/x86/vpx_dsp_rtcd.h +++ b/config/x86/vpx_dsp_rtcd.h @@ -15,6 +15,10 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -1185,12 +1189,12 @@ void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t * void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max); #define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c -void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const 
int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b vpx_highbd_quantize_b_sse2 -void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_sse2 unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1201,8 +1205,8 @@ unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_sse2 -void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_sse2 unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1213,8 +1217,8 @@ unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_sse2 -void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_sse2 unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1225,8 +1229,8 @@ unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_sse2 -void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_sse2 unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1237,8 +1241,8 @@ unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_sse2 -void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_sse2 unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1249,8 +1253,8 @@ unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define 
vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_sse2 -void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_sse2 unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1261,8 +1265,8 @@ unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_sse2 -void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_sse2 unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1271,8 +1275,8 @@ unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c -void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_sse2 unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1281,8 +1285,8 @@ unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c -void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void 
vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_sse2 unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1293,8 +1297,8 @@ unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_sse2 -void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_sse2 unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1305,8 +1309,8 @@ unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_sse2 -void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_sse2 unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1317,8 +1321,8 @@ unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_sse2 -void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x16x4d 
vpx_highbd_sad8x16x4d_sse2 unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1329,8 +1333,8 @@ unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_sse2 -void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_sse2 unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1341,13 +1345,115 @@ unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_sse2 -void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_sse2 +unsigned int vpx_highbd_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x16 vpx_highbd_sad_skip_16x16_sse2 + +void vpx_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x16x4d vpx_highbd_sad_skip_16x16x4d_sse2 + +unsigned int vpx_highbd_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x32 vpx_highbd_sad_skip_16x32_sse2 + +void vpx_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x32x4d vpx_highbd_sad_skip_16x32x4d_sse2 + 
+unsigned int vpx_highbd_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x8 vpx_highbd_sad_skip_16x8_sse2 + +void vpx_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x8x4d vpx_highbd_sad_skip_16x8x4d_sse2 + +unsigned int vpx_highbd_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x16 vpx_highbd_sad_skip_32x16_sse2 + +void vpx_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x16x4d vpx_highbd_sad_skip_32x16x4d_sse2 + +unsigned int vpx_highbd_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x32 vpx_highbd_sad_skip_32x32_sse2 + +void vpx_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x32x4d vpx_highbd_sad_skip_32x32x4d_sse2 + +unsigned int vpx_highbd_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x64 vpx_highbd_sad_skip_32x64_sse2 + +void vpx_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x64x4d vpx_highbd_sad_skip_32x64x4d_sse2 + +unsigned int vpx_highbd_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x4 vpx_highbd_sad_skip_4x4_c + +void vpx_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x4x4d vpx_highbd_sad_skip_4x4x4d_c + +unsigned int vpx_highbd_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x8 vpx_highbd_sad_skip_4x8_c + +void vpx_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void 
vpx_highbd_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x8x4d vpx_highbd_sad_skip_4x8x4d_sse2 + +unsigned int vpx_highbd_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x32 vpx_highbd_sad_skip_64x32_sse2 + +void vpx_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x32x4d vpx_highbd_sad_skip_64x32x4d_sse2 + +unsigned int vpx_highbd_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x64 vpx_highbd_sad_skip_64x64_sse2 + +void vpx_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x64x4d vpx_highbd_sad_skip_64x64x4d_sse2 + +unsigned int vpx_highbd_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x16 vpx_highbd_sad_skip_8x16_sse2 + +void vpx_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x16x4d vpx_highbd_sad_skip_8x16x4d_sse2 + +unsigned int vpx_highbd_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x4 vpx_highbd_sad_skip_8x4_c + +void vpx_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x4x4d vpx_highbd_sad_skip_8x4x4d_c + +unsigned int vpx_highbd_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x8 vpx_highbd_sad_skip_8x8_sse2 + +void vpx_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x8x4d vpx_highbd_sad_skip_8x8x4d_sse2 + int vpx_highbd_satd_c(const tran_low_t *coeff, int length); #define vpx_highbd_satd vpx_highbd_satd_c +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const 
uint8_t *b8,int b_stride, int width, int height); +#define vpx_highbd_sse vpx_highbd_sse_c + void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); #define vpx_highbd_subtract_block vpx_highbd_subtract_block_c @@ -1537,13 +1643,13 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); #define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_sse2 -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b vpx_quantize_b_ssse3 -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t 
*dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_ssse3 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1554,8 +1660,8 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x16_avg vpx_sad16x16_avg_sse2 -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x16x4d vpx_sad16x16x4d_sse2 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1566,8 +1672,8 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x32_avg vpx_sad16x32_avg_sse2 -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x32x4d vpx_sad16x32x4d_sse2 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1578,8 +1684,8 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x8_avg vpx_sad16x8_avg_sse2 -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x8x4d vpx_sad16x8x4d_sse2 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1590,8 +1696,8 @@ unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define 
vpx_sad32x16_avg vpx_sad32x16_avg_sse2 -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x16x4d vpx_sad32x16x4d_sse2 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1602,8 +1708,8 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_sse2 -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x32x4d vpx_sad32x32x4d_sse2 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1614,8 +1720,8 @@ unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x64_avg vpx_sad32x64_avg_sse2 -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x64x4d vpx_sad32x64x4d_sse2 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1626,8 +1732,8 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x4_avg vpx_sad4x4_avg_sse2 -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const 
uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x4x4d vpx_sad4x4x4d_sse2 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1638,8 +1744,8 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x8_avg vpx_sad4x8_avg_sse2 -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1650,8 +1756,8 @@ unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x32_avg vpx_sad64x32_avg_sse2 -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x32x4d vpx_sad64x32x4d_sse2 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1662,8 +1768,8 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_sse2 -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x64x4d vpx_sad64x64x4d_sse2 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1674,8 +1780,8 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x16_avg vpx_sad8x16_avg_sse2 -void vpx_sad8x16x4d_c(const 
uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x16x4d vpx_sad8x16x4d_sse2 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1686,8 +1792,8 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x4_avg vpx_sad8x4_avg_sse2 -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1698,10 +1804,110 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x8_avg vpx_sad8x8_avg_sse2 -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x8x4d vpx_sad8x8x4d_sse2 +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_sse2 + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_sse2 + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_sse2 + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int 
src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_sse2 + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_sse2 + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_sse2 + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_sse2 + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_sse2 + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_sse2 + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_sse2 + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_sse2 + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_sse2 + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_sse2 + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int 
src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_sse2 + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_sse2 + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_sse2 + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_sse2 + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_sse2 + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_sse2 + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_sse2 + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_sse2 + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_sse2 + int vpx_satd_c(const tran_low_t *coeff, int length); int vpx_satd_sse2(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_sse2 @@ -1725,6 +1931,9 @@ void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel 
*filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height); +#define vpx_sse vpx_sse_c + uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); diff --git a/config/x86/vpx_version.h b/config/x86/vpx_version.h index fa2cc50fd..00ab40fe2 100644 --- a/config/x86/vpx_version.h +++ b/config/x86/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 13 +#define VERSION_MINOR 14 #define VERSION_PATCH 0 -#define VERSION_EXTRA "1559-gcd7dbca207" +#define VERSION_EXTRA "1616-g26104bbc9d" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.13.0-1559-gcd7dbca207" -#define VERSION_STRING " v1.13.0-1559-gcd7dbca207" +#define VERSION_STRING_NOSP "v1.14.0-1616-g26104bbc9d" +#define VERSION_STRING " v1.14.0-1616-g26104bbc9d" diff --git a/config/x86_64/vp8_rtcd.h b/config/x86_64/vp8_rtcd.h index 5f7b32673..99ea7c278 100644 --- a/config/x86_64/vp8_rtcd.h +++ b/config/x86_64/vp8_rtcd.h @@ -45,15 +45,6 @@ void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_lin void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); #define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_ssse3 -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); -#define vp8_blend_b vp8_blend_b_c - -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); -#define vp8_blend_mb_inner vp8_blend_mb_inner_c - -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); -#define vp8_blend_mb_outer vp8_blend_mb_outer_c - int vp8_block_error_c(short *coeff, short *dqcoeff); int vp8_block_error_sse2(short *coeff, short *dqcoeff); #define vp8_block_error vp8_block_error_sse2 diff --git a/config/x86_64/vp9_rtcd.h b/config/x86_64/vp9_rtcd.h index 580d55a28..bd3233e20 100644 --- a/config/x86_64/vp9_rtcd.h +++ b/config/x86_64/vp9_rtcd.h @@ -21,7 +21,9 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -39,7 +41,7 @@ int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); #define vp9_block_error_fp vp9_block_error_fp_sse2 -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct 
vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); #define vp9_diamond_search_sad vp9_diamond_search_sad_c void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -83,10 +85,10 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int str void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c -void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c -void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count); @@ -104,13 +106,13 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, 
intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp vp9_quantize_fp_ssse3 -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_ssse3 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); diff --git a/config/x86_64/vpx_config.asm b/config/x86_64/vpx_config.asm index fdc51d32e..d0161162e 100644 --- a/config/x86_64/vpx_config.asm +++ b/config/x86_64/vpx_config.asm @@ -1,11 +1,15 @@ %define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 %define VPX_ARCH_MIPS 0 %define VPX_ARCH_X86 0 %define VPX_ARCH_X86_64 1 %define VPX_ARCH_PPC 0 %define VPX_ARCH_LOONGARCH 0 -%define HAVE_NEON 0 %define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 @@ -75,7 +79,6 @@ %define CONFIG_MULTI_RES_ENCODING 0 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_VP9_TEMPORAL_DENOISING 0 -%define CONFIG_CONSISTENT_RECODE 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_VP9_HIGHBITDEPTH 1 %define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -88,3 +91,4 @@ %define CONFIG_EMULATE_HARDWARE 0 %define CONFIG_NON_GREEDY_MV 0 %define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/config/x86_64/vpx_config.h b/config/x86_64/vpx_config.h index c624a9f8e..f33cb37d3 100644 --- a/config/x86_64/vpx_config.h +++ b/config/x86_64/vpx_config.h @@ -11,13 +11,17 @@ #define RESTRICT #define INLINE inline #define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 #define VPX_ARCH_MIPS 0 #define VPX_ARCH_X86 0 #define VPX_ARCH_X86_64 1 #define VPX_ARCH_PPC 0 #define VPX_ARCH_LOONGARCH 0 -#define HAVE_NEON 0 #define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -87,7 +91,6 @@ #define CONFIG_MULTI_RES_ENCODING 0 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define 
CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -100,6 +103,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 4096 #define DECODE_HEIGHT_LIMIT 3072 #endif /* VPX_CONFIG_H */ diff --git a/config/x86_64/vpx_dsp_rtcd.h b/config/x86_64/vpx_dsp_rtcd.h index 22401f1c0..5eb512172 100644 --- a/config/x86_64/vpx_dsp_rtcd.h +++ b/config/x86_64/vpx_dsp_rtcd.h @@ -15,6 +15,10 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -1192,12 +1196,12 @@ void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t * void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max); #define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c -void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b vpx_highbd_quantize_b_sse2 -void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_sse2
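The quantize entry points above are the headline interface change in these headers: vp9_quantize_fp, vpx_quantize_b, and their high-bitdepth and 32x32 variants now take the encoder's struct macroblock_plane and struct ScanOrder directly instead of up to seven loose int16_t pointers (zbin, round, quant, quant_shift, scan, iscan); the forward declarations added under CONFIG_VP9_ENCODER exist to make those types visible to the generated header. A minimal compilable sketch of the shape of that change, using simplified stand-in types rather than libvpx's real definitions:

/* Illustrative only: stand-in structs, not libvpx's macroblock_plane or
 * ScanOrder. The point is the call-site change implied by the prototypes
 * above: two struct pointers replace a tail of positional table pointers. */
#include <stdint.h>
#include <stdio.h>

struct plane_sketch { const int16_t *zbin, *round, *quant, *quant_shift; };
struct scan_sketch { const int16_t *scan, *iscan; };

/* Old shape: each table unpacked into its own argument. */
static void quantize_old(const int16_t *zbin, const int16_t *round,
                         const int16_t *quant, const int16_t *quant_shift,
                         const int16_t *scan, const int16_t *iscan) {
  (void)zbin; (void)round; (void)quant; (void)quant_shift;
  (void)scan; (void)iscan;
}

/* New shape: pass the structs and let the kernel read what it needs. */
static void quantize_new(const struct plane_sketch *p,
                         const struct scan_sketch *so) {
  (void)p; (void)so;
}

int main(void) {
  static const int16_t t[1] = { 0 };
  const struct plane_sketch p = { t, t, t, t };
  const struct scan_sketch so = { t, t };
  quantize_old(p.zbin, p.round, p.quant, p.quant_shift, so.scan, so.iscan);
  quantize_new(&p, &so); /* same information, two arguments instead of six */
  puts("ok");
  return 0;
}

Besides shortening every SIMD prototype, this lets future kernels pick up additional plane fields without another signature churn.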
unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1208,8 +1212,8 @@ unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_sse2 -void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_sse2 unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1220,8 +1224,8 @@ unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_sse2 -void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_sse2 unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1232,8 +1236,8 @@ unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_sse2 -void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_sse2 unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1244,8 +1248,8 @@ unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int
ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_sse2 -void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_sse2 unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1256,8 +1260,8 @@ unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_sse2 -void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_sse2 unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1268,8 +1272,8 @@ unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_sse2 -void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_sse2 unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1278,8 +1282,8 @@ unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c -void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const 
uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_sse2 unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1288,8 +1292,8 @@ unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c -void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_sse2 unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1300,8 +1304,8 @@ unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_sse2 -void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_sse2 unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1312,8 +1316,8 @@ unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_sse2 -void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t 
sad_array[4]); #define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_sse2 unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1324,8 +1328,8 @@ unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_sse2 -void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_sse2 unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1336,8 +1340,8 @@ unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_sse2 -void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_sse2 unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1348,13 +1352,115 @@ unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_sse2 -void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_sse2 +unsigned int vpx_highbd_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x16 vpx_highbd_sad_skip_16x16_sse2
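The vpx_highbd_sad_skip_* prototypes that begin here (their 8-bit counterparts, vpx_sad_skip_*, appear further down) are new speed kernels for coarse motion search: they compare only every other row of the block and then double the sum, so the result stays on the same scale as a full SAD at roughly half the cost. A minimal C sketch of the idea, assuming the usual definition rather than quoting the tree's implementation:

/* Skip-SAD sketch for one 8x8 block: visit rows 0, 2, 4, 6 and rescale.
 * Illustration only; the real kernels are generated per block size. */
#include <stdint.h>
#include <stdlib.h>

static unsigned int sad_skip_8x8_sketch(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 8; r += 2) { /* skip odd rows */
    for (c = 0; c < 8; ++c) {
      sad += (unsigned int)abs(src[r * src_stride + c] -
                               ref[r * ref_stride + c]);
    }
  }
  return 2 * sad; /* rescale to full-height magnitude */
}

The matching _x4d skip variants below apply the same trick to four candidate references per call, following the established x4d convention.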
+ +void vpx_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x16x4d vpx_highbd_sad_skip_16x16x4d_sse2 + +unsigned int vpx_highbd_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x32 vpx_highbd_sad_skip_16x32_sse2 + +void vpx_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x32x4d vpx_highbd_sad_skip_16x32x4d_sse2 + +unsigned int vpx_highbd_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x8 vpx_highbd_sad_skip_16x8_sse2 + +void vpx_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x8x4d vpx_highbd_sad_skip_16x8x4d_sse2 + +unsigned int vpx_highbd_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x16 vpx_highbd_sad_skip_32x16_sse2 + +void vpx_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x16x4d vpx_highbd_sad_skip_32x16x4d_sse2 + +unsigned int vpx_highbd_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x32 vpx_highbd_sad_skip_32x32_sse2 + +void vpx_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x32x4d vpx_highbd_sad_skip_32x32x4d_sse2 + +unsigned int vpx_highbd_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x64 vpx_highbd_sad_skip_32x64_sse2 + +void
vpx_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x64x4d vpx_highbd_sad_skip_32x64x4d_sse2 + +unsigned int vpx_highbd_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x4 vpx_highbd_sad_skip_4x4_c + +void vpx_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x4x4d vpx_highbd_sad_skip_4x4x4d_c + +unsigned int vpx_highbd_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x8 vpx_highbd_sad_skip_4x8_c + +void vpx_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x8x4d vpx_highbd_sad_skip_4x8x4d_sse2 + +unsigned int vpx_highbd_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x32 vpx_highbd_sad_skip_64x32_sse2 + +void vpx_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x32x4d vpx_highbd_sad_skip_64x32x4d_sse2 + +unsigned int vpx_highbd_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x64 vpx_highbd_sad_skip_64x64_sse2 + +void vpx_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x64x4d vpx_highbd_sad_skip_64x64x4d_sse2 + +unsigned int vpx_highbd_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x16 vpx_highbd_sad_skip_8x16_sse2 + +void vpx_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x16x4d vpx_highbd_sad_skip_8x16x4d_sse2 + +unsigned int vpx_highbd_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define 
vpx_highbd_sad_skip_8x4 vpx_highbd_sad_skip_8x4_c + +void vpx_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x4x4d vpx_highbd_sad_skip_8x4x4d_c + +unsigned int vpx_highbd_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x8 vpx_highbd_sad_skip_8x8_sse2 + +void vpx_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x8x4d vpx_highbd_sad_skip_8x8x4d_sse2 + int vpx_highbd_satd_c(const tran_low_t *coeff, int length); #define vpx_highbd_satd vpx_highbd_satd_c +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height); +#define vpx_highbd_sse vpx_highbd_sse_c + void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); #define vpx_highbd_subtract_block vpx_highbd_subtract_block_c @@ -1544,13 +1650,13 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); #define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_sse2 -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder 
*const scan_order); #define vpx_quantize_b vpx_quantize_b_ssse3 -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_ssse3 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1561,8 +1667,8 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x16_avg vpx_sad16x16_avg_sse2 -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x16x4d vpx_sad16x16x4d_sse2 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1573,8 +1679,8 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x32_avg vpx_sad16x32_avg_sse2 -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x32x4d vpx_sad16x32x4d_sse2 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1585,8 +1691,8 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad16x8_avg_sse2(const uint8_t 
*src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x8_avg vpx_sad16x8_avg_sse2 -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x8x4d vpx_sad16x8x4d_sse2 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1597,8 +1703,8 @@ unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x16_avg vpx_sad32x16_avg_sse2 -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x16x4d vpx_sad32x16x4d_sse2 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1609,8 +1715,8 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_sse2 -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x32x4d vpx_sad32x32x4d_sse2 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1621,8 +1727,8 @@ unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x64_avg vpx_sad32x64_avg_sse2 -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], 
int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x64x4d vpx_sad32x64x4d_sse2 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1633,8 +1739,8 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x4_avg vpx_sad4x4_avg_sse2 -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x4x4d vpx_sad4x4x4d_sse2 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1645,8 +1751,8 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x8_avg vpx_sad4x8_avg_sse2 -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1657,8 +1763,8 @@ unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x32_avg vpx_sad64x32_avg_sse2 -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x32x4d vpx_sad64x32x4d_sse2 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1669,8 +1775,8 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, 
const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_sse2 -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x64x4d vpx_sad64x64x4d_sse2 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1681,8 +1787,8 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x16_avg vpx_sad8x16_avg_sse2 -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x16x4d vpx_sad8x16x4d_sse2 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1693,8 +1799,8 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x4_avg vpx_sad8x4_avg_sse2 -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1705,10 +1811,110 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x8_avg vpx_sad8x8_avg_sse2 -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int 
src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x8x4d vpx_sad8x8x4d_sse2 +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_sse2 + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_sse2 + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_sse2 + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_sse2 + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_sse2 + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_sse2 + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_sse2 + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_sse2 + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_sse2 + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_sse2 + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_sse2 + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_sse2 + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_sse2 + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_sse2 + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_sse2 + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_sse2 + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_sse2 + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_sse2 + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_sse2 + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_sse2 + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, 
const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_sse2 + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_sse2 + int vpx_satd_c(const tran_low_t *coeff, int length); int vpx_satd_sse2(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_sse2 @@ -1732,6 +1938,9 @@ void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height); +#define vpx_sse vpx_sse_c + uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
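Also new in the hunk above is vpx_sse (its high-bitdepth twin vpx_highbd_sse was added earlier in this header): a standalone sum-of-squared-error kernel, SSE = sum over the block of (a - b)^2, returned as int64_t so large planes cannot overflow the accumulator. A reference sketch written to match the prototype, not copied from the tree:

/* Reference behaviour for a vpx_sse-style kernel: sum of squared pixel
 * differences over a width x height window. Sketch only. */
#include <stdint.h>

static int64_t sse_sketch(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int width, int height) {
  int64_t sse = 0;
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      const int diff = a[x] - b[x];
      sse += (int64_t)diff * diff; /* widen before accumulating */
    }
    a += a_stride;
    b += b_stride;
  }
  return sse;
}

Exposing SSE directly avoids routing through the variance functions when only the raw error sum is needed.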
diff --git a/config/x86_64/vpx_version.h b/config/x86_64/vpx_version.h index fa2cc50fd..00ab40fe2 100644 --- a/config/x86_64/vpx_version.h +++ b/config/x86_64/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 13 +#define VERSION_MINOR 14 #define VERSION_PATCH 0 -#define VERSION_EXTRA "1559-gcd7dbca207" +#define VERSION_EXTRA "1616-g26104bbc9d" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.13.0-1559-gcd7dbca207" -#define VERSION_STRING " v1.13.0-1559-gcd7dbca207" +#define VERSION_STRING_NOSP "v1.14.0-1616-g26104bbc9d" +#define VERSION_STRING " v1.14.0-1616-g26104bbc9d" @@ -102,11 +102,14 @@ all_platforms="${all_platforms} arm64-darwin-gcc" all_platforms="${all_platforms} arm64-darwin20-gcc" all_platforms="${all_platforms} arm64-darwin21-gcc" all_platforms="${all_platforms} arm64-darwin22-gcc" +all_platforms="${all_platforms} arm64-darwin23-gcc" all_platforms="${all_platforms} arm64-linux-gcc" all_platforms="${all_platforms} arm64-win64-gcc" all_platforms="${all_platforms} arm64-win64-vs15" all_platforms="${all_platforms} arm64-win64-vs16" +all_platforms="${all_platforms} arm64-win64-vs16-clangcl" all_platforms="${all_platforms} arm64-win64-vs17" +all_platforms="${all_platforms} arm64-win64-vs17-clangcl" all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8 @@ -163,6 +166,7 @@ all_platforms="${all_platforms} x86_64-darwin19-gcc" all_platforms="${all_platforms} x86_64-darwin20-gcc" all_platforms="${all_platforms} x86_64-darwin21-gcc" all_platforms="${all_platforms} x86_64-darwin22-gcc" +all_platforms="${all_platforms} x86_64-darwin23-gcc" all_platforms="${all_platforms} x86_64-iphonesimulator-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" @@ -243,12 +247,21 @@ CODEC_FAMILIES=" ARCH_LIST=" arm + aarch64 mips x86 x86_64 ppc loongarch " + +ARCH_EXT_LIST_AARCH64=" + neon + neon_dotprod + neon_i8mm + sve +" + ARCH_EXT_LIST_X86=" mmx sse @@ -268,8 +281,8 @@ ARCH_EXT_LIST_LOONGSON=" " ARCH_EXT_LIST=" - neon neon_asm + ${ARCH_EXT_LIST_AARCH64} mips32 dspr2 @@ -293,6 +306,7 @@ EXPERIMENT_LIST=" emulate_hardware non_greedy_mv rate_ctrl + collect_component_timing " CONFIG_LIST=" dependency_tracking @@ -342,7 +356,6 @@ CONFIG_LIST=" multi_res_encoding temporal_denoising vp9_temporal_denoising - consistent_recode coefficient_range_checking vp9_highbitdepth better_hw_compatibility @@ -406,7 +419,6 @@ CMDLINE_SELECT=" multi_res_encoding temporal_denoising vp9_temporal_denoising - consistent_recode coefficient_range_checking better_hw_compatibility vp9_highbitdepth @@ -633,7 +645,6 @@ process_toolchain() { if enabled gcc; then enabled werror && check_add_cflags -Werror check_add_cflags -Wall - check_add_cflags -Wdeclaration-after-statement check_add_cflags -Wdisabled-optimization check_add_cflags -Wextra-semi check_add_cflags -Wextra-semi-stmt @@ -647,8 +658,9 @@ process_toolchain() { check_add_cflags -Wimplicit-function-declaration check_add_cflags -Wmissing-declarations check_add_cflags -Wmissing-prototypes + check_add_cflags -Wshadow check_add_cflags -Wuninitialized - check_add_cflags -Wunreachable-code-loop-increment + check_add_cflags -Wunreachable-code-aggressive check_add_cflags -Wunused check_add_cflags -Wextra # check_add_cflags also adds to cxxflags. gtest does not do well with @@ -659,9 +671,8 @@ process_toolchain() { if enabled mips || [ -z "${INLINE}" ]; then enabled extra_warnings || check_add_cflags -Wno-unused-function fi - # Enforce c89 for c files. Don't be too strict about it though. Allow - # gnu extensions like "//" for comments. - check_cflags -std=gnu89 && add_cflags_only -std=gnu89 + # Enforce C99 for C files. Allow GNU extensions. + check_cflags -std=gnu99 && add_cflags_only -std=gnu99
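The last change in that configure hunk is one of the most consequential in the whole update: C sources are now built as gnu99 rather than gnu89, which is also why -Wdeclaration-after-statement was dropped from the warning list just above. Declarations may now follow statements and loop counters can be scoped to their loop, the pattern the SVC example code below adopts with its const int i. A small self-contained illustration of code the old gnu89 setup rejected:

/* Legal under -std=gnu99, rejected under the previous -std=gnu89 rules. */
#include <stdio.h>

int main(void) {
  int total = 0;
  puts("C99 scoping");           /* a statement first... */
  int step = 2;                  /* ...then a declaration (C99 only) */
  for (int i = 0; i < 5; ++i) {  /* loop-scoped counter (C99 only) */
    total += i * step;
  }
  printf("total = %d\n", total); /* prints total = 20 */
  return 0;
}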
Allow - # gnu extensions like "//" for comments. - check_cflags -std=gnu89 && add_cflags_only -std=gnu89 + # Enforce C99 for C files. Allow GNU extensions. + check_cflags -std=gnu99 && add_cflags_only -std=gnu99 # Avoid this warning for third_party C++ sources. Some reorganization # would be needed to apply this only to test/*.cc. check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32 @@ -676,14 +687,18 @@ process_toolchain() { check_add_cxxflags -Wc++14-extensions check_add_cxxflags -Wc++17-extensions check_add_cxxflags -Wc++20-extensions + check_add_cxxflags -Wnon-virtual-dtor - # disable some warnings specific to libyuv. + # disable some warnings specific to libyuv / libwebm. check_cxxflags -Wno-missing-declarations \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations" check_cxxflags -Wno-missing-prototypes \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes" check_cxxflags -Wno-pass-failed \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-pass-failed" + check_cxxflags -Wno-shadow \ + && LIBWEBM_CXXFLAGS="${LIBWEBM_CXXFLAGS} -Wno-shadow" \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-shadow" check_cxxflags -Wno-unused-parameter \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter" fi diff --git a/examples.mk b/examples.mk index 42886f1e1..22726a3d4 100644 --- a/examples.mk +++ b/examples.mk @@ -57,6 +57,7 @@ LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser/mkvparser.cc \ # Add compile flags and include path for libwebm sources. ifeq ($(CONFIG_WEBM_IO),yes) CXXFLAGS += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS + $(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS) INC_PATH-yes += $(SRC_PATH_BARE)/third_party/libwebm endif @@ -81,8 +82,6 @@ ifeq ($(CONFIG_LIBYUV),yes) $(BUILD_PFX)third_party/libyuv/%.cc.o: CXXFLAGS += ${LIBYUV_CXXFLAGS} endif ifeq ($(CONFIG_WEBM_IO),yes) - vpxdec.SRCS += $(LIBWEBM_COMMON_SRCS) - vpxdec.SRCS += $(LIBWEBM_MUXER_SRCS) vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS) vpxdec.SRCS += webmdec.cc webmdec.h endif diff --git a/examples/resize_util.c b/examples/resize_util.c index 7e529b2e2..5fb63e166 100644 --- a/examples/resize_util.c +++ b/examples/resize_util.c @@ -52,6 +52,7 @@ int main(int argc, char *argv[]) { uint8_t *inbuf_v, *outbuf_v; int f, frames; int width, height, target_width, target_height; + int failed = 0; exec_name = argv[0]; @@ -82,6 +83,7 @@ int main(int argc, char *argv[]) { } fpout = fopen(fout, "wb"); if (fpout == NULL) { + fclose(fpin); printf("Can't open file %s to write\n", fout); usage(); return 1; @@ -100,6 +102,11 @@ int main(int argc, char *argv[]) { inbuf = (uint8_t *)malloc(width * height * 3 / 2); outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2); + if (!(inbuf && outbuf)) { + printf("Failed to allocate buffers.\n"); + failed = 1; + goto Error; + } inbuf_u = inbuf + width * height; inbuf_v = inbuf_u + width * height / 4; outbuf_u = outbuf + target_width * target_height; @@ -114,10 +121,11 @@ int main(int argc, char *argv[]) { f++; } printf("%d frames processed\n", f); +Error: fclose(fpin); fclose(fpout); free(inbuf); free(outbuf); - return 0; + return failed; } diff --git a/examples/svc_encodeframe.c b/examples/svc_encodeframe.c index 003096e70..1dd731765 100644 --- a/examples/svc_encodeframe.c +++ b/examples/svc_encodeframe.c @@ -279,7 +279,7 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) { if (svc_ctx == NULL || options == NULL || si == NULL) { return VPX_CODEC_INVALID_PARAM; } - strncpy(si->options, options, 
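The VERSION_PACKED macro in the version hunk above packs major/minor/patch into one integer, with eight bits each for minor and patch. As a quick sanity check on the bump to v1.14.0 -- a stand-alone sketch, not part of the patch:

#include <stdio.h>

#define VERSION_MAJOR 1
#define VERSION_MINOR 14
#define VERSION_PATCH 0
#define VERSION_PACKED \
  ((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))

int main(void) {
  /* 1 << 16 = 0x10000 and 14 << 8 = 0xE00, so v1.14.0 packs to 0x10E00. */
  printf("VERSION_PACKED = 0x%X\n", VERSION_PACKED); /* prints 0x10E00 */
  return 0;
}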
diff --git a/examples.mk b/examples.mk
index 42886f1e1..22726a3d4 100644
--- a/examples.mk
+++ b/examples.mk
@@ -57,6 +57,7 @@ LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser/mkvparser.cc \
 # Add compile flags and include path for libwebm sources.
 ifeq ($(CONFIG_WEBM_IO),yes)
   CXXFLAGS += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
+  $(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS)
   INC_PATH-yes += $(SRC_PATH_BARE)/third_party/libwebm
 endif

@@ -81,8 +82,6 @@ ifeq ($(CONFIG_LIBYUV),yes)
   $(BUILD_PFX)third_party/libyuv/%.cc.o: CXXFLAGS += ${LIBYUV_CXXFLAGS}
 endif
 ifeq ($(CONFIG_WEBM_IO),yes)
-  vpxdec.SRCS += $(LIBWEBM_COMMON_SRCS)
-  vpxdec.SRCS += $(LIBWEBM_MUXER_SRCS)
   vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS)
   vpxdec.SRCS += webmdec.cc webmdec.h
 endif
diff --git a/examples/resize_util.c b/examples/resize_util.c
index 7e529b2e2..5fb63e166 100644
--- a/examples/resize_util.c
+++ b/examples/resize_util.c
@@ -52,6 +52,7 @@ int main(int argc, char *argv[]) {
   uint8_t *inbuf_v, *outbuf_v;
   int f, frames;
   int width, height, target_width, target_height;
+  int failed = 0;

   exec_name = argv[0];

@@ -82,6 +83,7 @@ int main(int argc, char *argv[]) {
   }
   fpout = fopen(fout, "wb");
   if (fpout == NULL) {
+    fclose(fpin);
     printf("Can't open file %s to write\n", fout);
     usage();
     return 1;
@@ -100,6 +102,11 @@ int main(int argc, char *argv[]) {

   inbuf = (uint8_t *)malloc(width * height * 3 / 2);
   outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2);
+  if (!(inbuf && outbuf)) {
+    printf("Failed to allocate buffers.\n");
+    failed = 1;
+    goto Error;
+  }
   inbuf_u = inbuf + width * height;
   inbuf_v = inbuf_u + width * height / 4;
   outbuf_u = outbuf + target_width * target_height;
@@ -114,10 +121,11 @@ int main(int argc, char *argv[]) {
     f++;
   }
   printf("%d frames processed\n", f);
+Error:
   fclose(fpin);
   fclose(fpout);
   free(inbuf);
   free(outbuf);
-  return 0;
+  return failed;
 }
diff --git a/examples/svc_encodeframe.c b/examples/svc_encodeframe.c
index 003096e70..1dd731765 100644
--- a/examples/svc_encodeframe.c
+++ b/examples/svc_encodeframe.c
@@ -279,7 +279,7 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
   if (svc_ctx == NULL || options == NULL || si == NULL) {
    return VPX_CODEC_INVALID_PARAM;
   }
-  strncpy(si->options, options, sizeof(si->options));
+  strncpy(si->options, options, sizeof(si->options) - 1);
   si->options[sizeof(si->options) - 1] = '\0';
   return VPX_CODEC_OK;
 }
@@ -381,7 +381,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
                              vpx_codec_iface_t *iface,
                              vpx_codec_enc_cfg_t *enc_cfg) {
   vpx_codec_err_t res;
-  int i, sl, tl;
+  int sl, tl;
   SvcInternal_t *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL ||
       enc_cfg == NULL) {
@@ -433,7 +433,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
   }
   for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
     for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
-      i = sl * svc_ctx->temporal_layers + tl;
+      const int i = sl * svc_ctx->temporal_layers + tl;
       si->svc_params.max_quantizers[i] = MAX_QUANTIZER;
       si->svc_params.min_quantizers[i] = 0;
       if (enc_cfg->rc_end_usage == VPX_CBR &&
@@ -503,7 +503,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,

   for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
     for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
-      i = sl * svc_ctx->temporal_layers + tl;
+      const int i = sl * svc_ctx->temporal_layers + tl;
       if (enc_cfg->rc_end_usage == VPX_CBR &&
           enc_cfg->g_pass == VPX_RC_ONE_PASS) {
         si->svc_params.max_quantizers[i] = enc_cfg->rc_max_quantizer;
diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c
index d287e5831..998e4fb20 100644
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -32,6 +32,7 @@
 #include "vp9/encoder/vp9_encoder.h"
 #include "./y4minput.h"

+#define OUTPUT_FRAME_STATS 0
 #define OUTPUT_RC_STATS 1

 #define SIMULCAST_MODE 0
@@ -315,7 +316,6 @@ static void parse_command_line(int argc, const char **argv_,
         break;
       default:
         die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth);
-        break;
     }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   } else if (arg_match(&arg, &dropframe_thresh_arg, argi)) {
@@ -880,7 +880,9 @@ int main(int argc, const char **argv) {
   int pts = 0;            /* PTS starts at 0 */
   int frame_duration = 1; /* 1 timebase tick per frame */
   int end_of_stream = 0;
+#if OUTPUT_FRAME_STATS
   int frames_received = 0;
+#endif
 #if OUTPUT_RC_STATS
   VpxVideoWriter *outfile[VPX_SS_MAX_LAYERS] = { NULL };
   struct RateControlStats rc;
@@ -1126,14 +1128,14 @@ int main(int argc, const char **argv) {
             }
 #endif
           }
-          /*
+#if OUTPUT_FRAME_STATS
           printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
                  !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
                  (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
-          */
+          ++frames_received;
+#endif
           if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1)
             si->bytes_sum[0] += (int)cx_pkt->data.frame.sz;
-          ++frames_received;
 #if CONFIG_VP9_DECODER && !SIMULCAST_MODE
           if (vpx_codec_decode(&decoder, cx_pkt->data.frame.buf,
                                (unsigned int)cx_pkt->data.frame.sz, NULL, 0))
diff --git a/generate_config.sh b/generate_config.sh
index 79700d535..a8a43e149 100755
--- a/generate_config.sh
+++ b/generate_config.sh
@@ -210,7 +210,8 @@ intel="--disable-sse4_1 --disable-avx --disable-avx2 --disable-avx512 --as=yasm"
 gen_config_files x86 "--target=x86-linux-gcc ${intel} ${all_platforms}"
 gen_config_files x86_64 "--target=x86_64-linux-gcc ${intel} ${all_platforms}"
 gen_config_files arm-neon "--target=armv7-linux-gcc ${all_platforms}"
-gen_config_files arm64 "--force-target=armv8-linux-gcc ${all_platforms}"
+arm64="--disable-neon_dotprod --disable-neon_i8mm"
+gen_config_files arm64 "--target=armv8-linux-gcc ${arm64} ${all_platforms}"
 gen_config_files generic "--target=generic-gnu ${all_platforms}"

 echo "Remove temporary directory."
@@ -233,7 +234,7 @@ cd $TEMP_DIR
 gen_rtcd_header x86 x86 "${intel}"
 gen_rtcd_header x86_64 x86_64 "${intel}"
 gen_rtcd_header arm-neon armv7
-gen_rtcd_header arm64 armv8
+gen_rtcd_header arm64 armv8 "${arm64}"
 gen_rtcd_header generic generic

 echo "Prepare Makefile."
@@ -178,6 +178,7 @@ INSTALL-LIBS-yes += include/vpx/vpx_image.h
 INSTALL-LIBS-yes += include/vpx/vpx_integer.h
 INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h
 INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h
+INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_tpl.h
 ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
 ifeq ($(CONFIG_MSVS),yes)
 INSTALL-LIBS-yes += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib)
@@ -312,9 +313,9 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
 # To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
 # SO_VERSION_* then follow the rules in the link to detemine the new version
 # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
-SO_VERSION_MAJOR := 8
+SO_VERSION_MAJOR := 9
 SO_VERSION_MINOR := 0
-SO_VERSION_PATCH := 1
+SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
 LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib
 SHARED_LIB_SUF := .dylib
@@ -545,7 +546,7 @@ testdata: $(LIBVPX_TEST_DATA)
 	    echo "Checking test data:";\
 	    for f in $(call enabled,LIBVPX_TEST_DATA); do\
 	      grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
-	        (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\
+	        (cd "$(LIBVPX_TEST_DATA_PATH)"; $${sha1sum} -c);\
 	    done; \
         else\
 	    echo "Skipping test data integrity check, sha1sum not found.";\
@@ -631,8 +632,8 @@ test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
             -L. -l$(CODEC_LIB) -l$(RC_RTC_LIB) -l$(GTEST_LIB) $^
 endif  # RC_INTERFACE_TEST
-endif  # CONFIG_VP9_ENCODER
-endif
+endif  # CONFIG_ENCODERS
+endif  # CONFIG_MSVS
 else

 include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk
@@ -699,7 +700,7 @@ $(eval $(call linkerxx_template,$(SIMPLE_ENCODE_TEST_BIN), \
           -L. -lsimple_encode -lvpx -lgtest $(extralibs) -lm))
 endif  # SIMPLE_ENCODE_TEST

-endif  # CONFIG_UNIT_TESTS
+endif  # CONFIG_EXTERNAL_BUILD

 # Install test sources only if codec source is included
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\
@@ -724,7 +725,7 @@ NUM_SHARDS := 10
 SHARDS := 0 1 2 3 4 5 6 7 8 9
 $(foreach s,$(SHARDS),$(eval $(call test_shard_template,$(s),$(NUM_SHARDS))))

-endif
+endif  # CONFIG_UNIT_TESTS

 ##
 ## documentation directives
@@ -764,10 +765,10 @@ TEST_BIN_PATH := $(addsuffix /$(TGT_OS:win64=x64)/Release, $(TEST_BIN_PATH))
 endif
 utiltest utiltest-no-data-check:
 	$(qexec)$(SRC_PATH_BARE)/test/vpxdec.sh \
-		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
+		--test-data-path "$(LIBVPX_TEST_DATA_PATH)" \
 		--bin-path $(TEST_BIN_PATH)
 	$(qexec)$(SRC_PATH_BARE)/test/vpxenc.sh \
-		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
+		--test-data-path "$(LIBVPX_TEST_DATA_PATH)" \
 		--bin-path $(TEST_BIN_PATH)
 utiltest: testdata
 else
@@ -791,7 +792,7 @@ EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release
 endif
 exampletest exampletest-no-data-check: examples
 	$(qexec)$(SRC_PATH_BARE)/test/examples.sh \
-		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
+		--test-data-path "$(LIBVPX_TEST_DATA_PATH)" \
 		--bin-path $(EXAMPLES_BIN_PATH)
 exampletest: testdata
 else
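The SO_VERSION bump above follows the libtool current:age:revision bookkeeping described in the comment. Reading the numbers off the diff, with the c/a/r reasoning being an assumption based on libtool's usual rules rather than anything stated in the patch:

#include <stdio.h>

int main(void) {
  /* Old values MAJOR 8, MINOR 0, PATCH 1 imply c = 8, a = 0, r = 1.
   * Interfaces changed this release (vpx_tpl.h was added, among others),
   * so -- assuming the standard libtool rules -- current is bumped and
   * age/revision reset: c = 9, a = 0, r = 0. */
  const int current = 9, age = 0, revision = 0;
  printf("SO_VERSION = %d.%d.%d\n", current - age, age, revision); /* 9.0.0 */
  return 0;
}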
diff --git a/test/acm_random.h b/test/acm_random.h
index c7122b933..e3520c47d 100644
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -45,16 +45,11 @@ class ACMRandom {
     return static_cast<int16_t>(random_.Generate(65536));
   }

-  int16_t Rand13Signed() {
-    // Use 13 bits: values between 4095 and -4096.
-    const uint32_t value = random_.Generate(8192);
-    return static_cast<int16_t>(value) - 4096;
-  }
-
-  int16_t Rand9Signed() {
-    // Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
-    const uint32_t value = random_.Generate(512);
-    return static_cast<int16_t>(value) - 256;
+  uint16_t Rand12() {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    // There's a bit more entropy in the upper bits of this implementation.
+    return (value >> 19) & 0xfff;
   }

   uint8_t Rand8() {
diff --git a/test/active_map_refresh_test.cc b/test/active_map_refresh_test.cc
index 68d8856ea..ad067346a 100644
--- a/test/active_map_refresh_test.cc
+++ b/test/active_map_refresh_test.cc
@@ -62,16 +62,16 @@ class ActiveMapRefreshTest
     public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   ActiveMapRefreshTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ActiveMapRefreshTest() {}
+  ~ActiveMapRefreshTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     cpu_used_ = GET_PARAM(2);
   }

-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     ::libvpx_test::Y4mVideoSource *y4m_video =
         static_cast<libvpx_test::Y4mVideoSource *>(video);
     if (video->frame() == 0) {
diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index 543ec0d35..d222c00b7 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc
@@ -26,16 +26,16 @@ class ActiveMapTest
   static const int kHeight = 144;

   ActiveMapTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ActiveMapTest() {}
+  ~ActiveMapTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     cpu_used_ = GET_PARAM(2);
   }

-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
       encoder->Control(VP9E_SET_AQ_MODE, GET_PARAM(3));
diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc
index 7dc86e3eb..4fc4e81e6 100644
--- a/test/add_noise_test.cc
+++ b/test/add_noise_test.cc
@@ -32,8 +32,8 @@ typedef std::tuple<double, AddNoiseFunc> AddNoiseTestFPParam;
 class AddNoiseTest : public ::testing::Test,
                      public ::testing::WithParamInterface<AddNoiseTestFPParam> {
  public:
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-  virtual ~AddNoiseTest() {}
+  void TearDown() override { libvpx_test::ClearSystemState(); }
+  ~AddNoiseTest() override = default;
 };

 double stddev6(char a, char b, char c, char d, char e, char f) {
diff --git a/test/alt_ref_aq_segment_test.cc b/test/alt_ref_aq_segment_test.cc
index 00a00e27c..3b1a26ed1 100644
--- a/test/alt_ref_aq_segment_test.cc
+++ b/test/alt_ref_aq_segment_test.cc
@@ -20,9 +20,9 @@ class AltRefAqSegmentTest
     public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   AltRefAqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~AltRefAqSegmentTest() {}
+  ~AltRefAqSegmentTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
@@ -30,8 +30,8 @@ class AltRefAqSegmentTest
     alt_ref_aq_mode_ = 0;
   }

-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_ALT_REF_AQ, alt_ref_aq_mode_);
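The new Rand12 above draws a full-range value and keeps the top twelve bits rather than the bottom ones; as its comment notes, the upper bits of this generator carry more entropy (in linear congruential generators the low-order bits cycle with short periods). A stand-alone C sketch of the same extraction -- the 31-bit range mirrors gtest's Random::kMaxRange, and the LCG here is a stand-in, not gtest's implementation:

#include <stdint.h>
#include <stdio.h>

/* Stand-in LCG producing 31-bit values. */
static uint32_t lcg_state = 1;
static uint32_t lcg_next(void) {
  lcg_state = lcg_state * 1103515245u + 12345u;
  return lcg_state & 0x7fffffff; /* 31 usable bits: 0 .. 2^31 - 1 */
}

int main(void) {
  /* Keep bits 19..30 -- the twelve highest of the 31 -- instead of the
   * low bits, which repeat with a short period in an LCG. */
  uint16_t r12 = (uint16_t)((lcg_next() >> 19) & 0xfff);
  printf("%u\n", r12); /* always in [0, 4095] */
  return 0;
}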
diff --git a/test/altref_test.cc b/test/altref_test.cc
index 69bcef774..903230fde 100644
--- a/test/altref_test.cc
+++ b/test/altref_test.cc
@@ -24,24 +24,24 @@ class AltRefTest : public ::libvpx_test::EncoderTest,
                    public ::libvpx_test::CodecTestWithParam<int> {
  protected:
   AltRefTest() : EncoderTest(GET_PARAM(0)), altref_count_(0) {}
-  virtual ~AltRefTest() {}
+  ~AltRefTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(libvpx_test::kTwoPassGood);
   }

-  virtual void BeginPassHook(unsigned int /*pass*/) { altref_count_ = 0; }
+  void BeginPassHook(unsigned int /*pass*/) override { altref_count_ = 0; }

-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
       encoder->Control(VP8E_SET_CPUUSED, 3);
     }
   }

-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) ++altref_count_;
   }

@@ -75,17 +75,17 @@ class AltRefForcedKeyTestLarge
   AltRefForcedKeyTestLarge()
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {}
-  virtual ~AltRefForcedKeyTestLarge() {}
+  ~AltRefForcedKeyTestLarge() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     cfg_.rc_end_usage = VPX_VBR;
     cfg_.g_threads = 0;
   }

-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
@@ -100,7 +100,7 @@ class AltRefForcedKeyTestLarge
         (video->frame() == forced_kf_frame_num_) ? VPX_EFLAG_FORCE_KF : 0;
   }

-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (frame_num_ == forced_kf_frame_num_) {
       ASSERT_TRUE(!!(pkt->data.frame.flags & VPX_FRAME_IS_KEY))
           << "Frame #" << frame_num_ << " isn't a keyframe!";
diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc
index 2cbc991d0..955e1dafc 100644
--- a/test/aq_segment_test.cc
+++ b/test/aq_segment_test.cc
@@ -20,17 +20,17 @@ class AqSegmentTest
     public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   AqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~AqSegmentTest() {}
+  ~AqSegmentTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
     aq_mode_ = 0;
   }

-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 196522ce5..ede9c0ba8 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -38,7 +38,7 @@ class AverageTestBase : public ::testing::Test {
       : width_(width), height_(height), source_data_(nullptr),
         source_stride_(0), bit_depth_(8) {}

-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(source_data_);
     source_data_ = nullptr;
     libvpx_test::ClearSystemState();
@@ -49,7 +49,7 @@ class AverageTestBase : public ::testing::Test {
   static const int kDataAlignment = 16;
   static const int kDataBlockSize = 64 * 128;

-  virtual void SetUp() {
+  void SetUp() override {
     source_data_ = reinterpret_cast<Pixel *>(
         vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
     ASSERT_NE(source_data_, nullptr);
@@ -169,7 +169,7 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
   }

  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     source_data_ = reinterpret_cast<uint8_t *>(
         vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
     ASSERT_NE(source_data_, nullptr);
@@ -180,7 +180,7 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
         vpx_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
   }

-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(source_data_);
     source_data_ = nullptr;
     vpx_free(hbuf_c_);
@@ -190,8 +190,9 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
   }

   void RunComparison() {
-    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
-    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, width_, height_));
+    ASM_REGISTER_STATE_CHECK(
+        asm_func_(hbuf_asm_, source_data_, width_, height_));
     EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
         << "Output mismatch";
   }
@@ -238,7 +239,7 @@ typedef std::tuple<int, SatdFunc> SatdTestParam;
 class SatdTest : public ::testing::Test,
                  public ::testing::WithParamInterface<SatdTestParam> {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     satd_size_ = GET_PARAM(0);
     satd_func_ = GET_PARAM(1);
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -247,7 +248,7 @@ class SatdTest : public ::testing::Test,
     ASSERT_NE(src_, nullptr);
   }

-  virtual void TearDown() {
+  void TearDown() override {
     libvpx_test::ClearSystemState();
     vpx_free(src_);
   }
@@ -276,7 +277,7 @@ class SatdTest : public ::testing::Test,

 class SatdLowbdTest : public SatdTest {
  protected:
-  virtual void FillRandom() {
+  void FillRandom() override {
     for (int i = 0; i < satd_size_; ++i) {
       const int16_t tmp = rnd_.Rand16Signed();
       src_[i] = (tran_low_t)tmp;
@@ -292,7 +293,7 @@ class BlockErrorTestFP
     : public ::testing::Test,
       public ::testing::WithParamInterface<BlockErrorTestFPParam> {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     txfm_size_ = GET_PARAM(0);
     block_error_func_ = GET_PARAM(1);
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -304,7 +305,7 @@ class BlockErrorTestFP
     ASSERT_NE(dqcoeff_, nullptr);
   }

-  virtual void TearDown() {
+  void TearDown() override {
     libvpx_test::ClearSystemState();
     vpx_free(coeff_);
     vpx_free(dqcoeff_);
@@ -463,7 +464,7 @@ TEST_P(SatdLowbdTest, DISABLED_Speed) {
 #if CONFIG_VP9_HIGHBITDEPTH
 class SatdHighbdTest : public SatdTest {
  protected:
-  virtual void FillRandom() {
+  void FillRandom() override {
     for (int i = 0; i < satd_size_; ++i) {
       src_[i] = rnd_.Rand20Signed();
     }
@@ -582,6 +583,13 @@ INSTANTIATE_TEST_SUITE_P(
                       make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2)));
 #endif  // HAVE_SSE2

+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AverageTestHBD,
+    ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_neon),
+                      make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_neon)));
+#endif  // HAVE_NEON
+
 INSTANTIATE_TEST_SUITE_P(C, SatdHighbdTest,
                          ::testing::Values(make_tuple(16, &vpx_satd_c),
                                            make_tuple(64, &vpx_satd_c),
@@ -694,16 +702,21 @@ INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest,
                                            make_tuple(256, &vpx_satd_neon),
                                            make_tuple(1024, &vpx_satd_neon)));

-// TODO(jianj): Remove the highbitdepth flag once the SIMD functions are
-// in place.
-#if !CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    NEON, SatdHighbdTest,
+    ::testing::Values(make_tuple(16, &vpx_highbd_satd_neon),
+                      make_tuple(64, &vpx_highbd_satd_neon),
+                      make_tuple(256, &vpx_highbd_satd_neon),
+                      make_tuple(1024, &vpx_highbd_satd_neon)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 INSTANTIATE_TEST_SUITE_P(
     NEON, BlockErrorTestFP,
     ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon),
                       make_tuple(64, &vp9_block_error_fp_neon),
                       make_tuple(256, &vp9_block_error_fp_neon),
                       make_tuple(1024, &vp9_block_error_fp_neon)));
-#endif  // !CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON

 #if HAVE_MSA
diff --git a/test/bench.h b/test/bench.h
index 57ca9118b..203e4d247 100644
--- a/test/bench.h
+++ b/test/bench.h
@@ -16,6 +16,8 @@

 class AbstractBench {
  public:
+  virtual ~AbstractBench() = default;
+
   void RunNTimes(int n);
   void PrintMedian(const char *title);

diff --git a/test/blockiness_test.cc b/test/blockiness_test.cc
index 11b2a3f61..5a45bc0b7 100644
--- a/test/blockiness_test.cc
+++ b/test/blockiness_test.cc
@@ -49,14 +49,14 @@ class BlockinessTestBase : public ::testing::Test {
     reference_data_ = nullptr;
   }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

  protected:
   // Handle frames up to 640x480
   static const int kDataAlignment = 16;
   static const int kDataBufferSize = 640 * 480;

-  virtual void SetUp() {
+  void SetUp() override {
     source_stride_ = (width_ + 31) & ~31;
     reference_stride_ = width_ * 2;
     rnd_.Reset(ACMRandom::DeterministicSeed());
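The IntProRowTest change above stops passing 0 as the reference stride and passes the real width instead. With a zero stride every iteration of the kernel re-reads row 0, so a bug in the row stepping could never be caught. A simplified scalar model of why the stride argument matters -- this is an illustration, not the actual vpx_int_pro_row kernel, whose normalization differs:

#include <stdint.h>

static void int_pro_row_model(int16_t hbuf[16], const uint8_t *ref,
                              int ref_stride, int height) {
  for (int col = 0; col < 16; ++col) {
    int sum = 0;
    /* Rows are ref_stride bytes apart; with ref_stride == 0 this loop
     * would sum the same row `height` times. */
    for (int row = 0; row < height; ++row) {
      sum += ref[row * ref_stride + col];
    }
    hbuf[col] = (int16_t)sum; /* the real kernel also normalizes */
  }
}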
diff --git a/test/borders_test.cc b/test/borders_test.cc
index 3c1f69a92..2726bd557 100644
--- a/test/borders_test.cc
+++ b/test/borders_test.cc
@@ -22,15 +22,15 @@ class BordersTest
     public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
   BordersTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~BordersTest() {}
+  ~BordersTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
   }

-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, 1);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
@@ -40,7 +40,7 @@ class BordersTest
     }
   }

-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
     }
   }
diff --git a/test/byte_alignment_test.cc b/test/byte_alignment_test.cc
index 1e0ffceb8..ba6fffc52 100644
--- a/test/byte_alignment_test.cc
+++ b/test/byte_alignment_test.cc
@@ -58,7 +58,7 @@ class ByteAlignmentTest
   ByteAlignmentTest()
       : video_(nullptr), decoder_(nullptr), md5_file_(nullptr) {}

-  virtual void SetUp() {
+  void SetUp() override {
     video_ = new libvpx_test::WebMVideoSource(kVP9TestFile);
     ASSERT_NE(video_, nullptr);
     video_->Init();
@@ -71,7 +71,7 @@ class ByteAlignmentTest
     OpenMd5File(kVP9Md5File);
   }

-  virtual void TearDown() {
+  void TearDown() override {
     if (md5_file_ != nullptr) fclose(md5_file_);

     delete decoder_;
diff --git a/test/codec_factory.h b/test/codec_factory.h
index 96092610c..d00563df1 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -84,7 +84,7 @@ class VP8Decoder : public Decoder {
       : Decoder(cfg, flag) {}

  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP8_DECODER
     return &vpx_codec_vp8_dx_algo;
 #else
@@ -100,7 +100,7 @@ class VP8Encoder : public Encoder {
       : Encoder(cfg, deadline, init_flags, stats) {}

  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP8_ENCODER
     return &vpx_codec_vp8_cx_algo;
 #else
@@ -113,12 +113,12 @@ class VP8CodecFactory : public CodecFactory {
  public:
   VP8CodecFactory() : CodecFactory() {}

-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override {
     return CreateDecoder(cfg, 0);
   }

-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
-                                 const vpx_codec_flags_t flags) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                         const vpx_codec_flags_t flags) const override {
 #if CONFIG_VP8_DECODER
     return new VP8Decoder(cfg, flags);
 #else
@@ -128,10 +128,9 @@ class VP8CodecFactory : public CodecFactory {
 #endif
   }

-  virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg,
-                                 unsigned long deadline,
-                                 const unsigned long init_flags,
-                                 TwopassStatsStore *stats) const {
+  Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+                         const unsigned long init_flags,
+                         TwopassStatsStore *stats) const override {
 #if CONFIG_VP8_ENCODER
     return new VP8Encoder(cfg, deadline, init_flags, stats);
 #else
@@ -143,8 +142,8 @@ class VP8CodecFactory : public CodecFactory {
 #endif
   }

-  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
-                                               int usage) const {
+  vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                       int usage) const override {
 #if CONFIG_VP8_ENCODER
     return vpx_codec_enc_config_default(&vpx_codec_vp8_cx_algo, cfg, usage);
 #else
@@ -180,7 +179,7 @@ class VP9Decoder : public Decoder {
       : Decoder(cfg, flag) {}

  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP9_DECODER
     return &vpx_codec_vp9_dx_algo;
 #else
@@ -196,7 +195,7 @@ class VP9Encoder : public Encoder {
       : Encoder(cfg, deadline, init_flags, stats) {}

  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP9_ENCODER
     return &vpx_codec_vp9_cx_algo;
 #else
@@ -209,12 +208,12 @@ class VP9CodecFactory : public CodecFactory {
  public:
   VP9CodecFactory() : CodecFactory() {}

-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override {
     return CreateDecoder(cfg, 0);
   }

-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
-                                 const vpx_codec_flags_t flags) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                         const vpx_codec_flags_t flags) const override {
 #if CONFIG_VP9_DECODER
     return new VP9Decoder(cfg, flags);
 #else
@@ -224,10 +223,9 @@ class VP9CodecFactory : public CodecFactory {
 #endif
   }

-  virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg,
-                                 unsigned long deadline,
-                                 const unsigned long init_flags,
-                                 TwopassStatsStore *stats) const {
+  Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+                         const unsigned long init_flags,
+                         TwopassStatsStore *stats) const override {
 #if CONFIG_VP9_ENCODER
     return new VP9Encoder(cfg, deadline, init_flags, stats);
 #else
@@ -239,8 +237,8 @@ class VP9CodecFactory : public CodecFactory {
 #endif
   }

-  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
-                                               int usage) const {
+  vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                       int usage) const override {
 #if CONFIG_VP9_ENCODER
     return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage);
 #else
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index 70aeab8d7..3234cc9a2 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -49,7 +49,7 @@ using AvgPredFunc = void (*)(uint8_t *a, const uint8_t *b, int w, int h,
 template <int bitdepth, typename Pixel>
 class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     avg_pred_func_ = GetParam();
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
@@ -81,11 +81,11 @@ void AvgPredTest<bitdepth, Pixel>::TestSizeCombinations() {
       // Only the reference buffer may have a stride not equal to width.
       Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
       ASSERT_TRUE(ref.Init());
-      Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
+      Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
       ASSERT_TRUE(pred.Init());
-      Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
+      Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 32);
       ASSERT_TRUE(avg_ref.Init());
-      Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
+      Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 32);
       ASSERT_TRUE(avg_chk.Init());
       const int bitdepth_mask = (1 << bitdepth) - 1;
       for (int h = 0; h < height; ++h) {
@@ -121,11 +121,11 @@ void AvgPredTest<bitdepth, Pixel>::TestCompareReferenceRandom() {
   const int height = 32;
   Buffer<Pixel> ref = Buffer<Pixel>(width, height, 8);
   ASSERT_TRUE(ref.Init());
-  Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
+  Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
   ASSERT_TRUE(pred.Init());
-  Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
+  Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 32);
   ASSERT_TRUE(avg_ref.Init());
-  Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
+  Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 32);
   ASSERT_TRUE(avg_chk.Init());

   for (int i = 0; i < 500; ++i) {
@@ -167,9 +167,9 @@ void AvgPredTest<bitdepth, Pixel>::TestSpeed() {
       const int height = 1 << height_pow;
       Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
       ASSERT_TRUE(ref.Init());
-      Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
+      Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
       ASSERT_TRUE(pred.Init());
-      Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 16);
+      Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 32);
       ASSERT_TRUE(avg.Init());
       const int bitdepth_mask = (1 << bitdepth) - 1;
       for (int h = 0; h < height; ++h) {
@@ -217,6 +217,11 @@ INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTestLBD,
                          ::testing::Values(&vpx_comp_avg_pred_sse2));
 #endif  // HAVE_SSE2

+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AvgPredTestLBD,
+                         ::testing::Values(&vpx_comp_avg_pred_avx2));
+#endif  // HAVE_AVX2
+
 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD,
                          ::testing::Values(&vpx_comp_avg_pred_neon));
@@ -260,5 +265,11 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>));
 #endif  // HAVE_SSE2

+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AvgPredTestHBD,
+    ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_neon>));
+#endif  // HAVE_NEON
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/test/config_test.cc b/test/config_test.cc
index 8f4c60e11..729b01151 100644
--- a/test/config_test.cc
+++ b/test/config_test.cc
@@ -22,24 +22,24 @@ class ConfigTest
   ConfigTest()
       : EncoderTest(GET_PARAM(0)), frame_count_in_(0), frame_count_out_(0),
         frame_count_max_(0) {}
-  virtual ~ConfigTest() {}
+  ~ConfigTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
   }

-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     frame_count_in_ = 0;
     frame_count_out_ = 0;
   }

-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource * /*video*/) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource * /*video*/) override {
     ++frame_count_in_;
     abort_ |= (frame_count_in_ >= frame_count_max_);
   }

-  virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {
+  void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) override {
     ++frame_count_out_;
   }
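The comp_avg_pred buffers above move from 16-byte to 32-byte alignment, presumably because the newly instantiated AVX2 path operates on 32-byte ymm vectors, for which 16-byte alignment is no longer sufficient. A small stand-alone sketch of allocating such a buffer -- posix_memalign is used here as a portable stand-in for vpx_memalign:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  void *buf = NULL;
  /* 32-byte alignment covers a full AVX2 ymm load/store. */
  if (posix_memalign(&buf, 32, 64 * 64) != 0) return 1;
  printf("32-byte aligned? %s\n",
         ((uintptr_t)buf % 32 == 0) ? "yes" : "no"); /* yes */
  free(buf);
  return 0;
}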
diff --git a/test/consistency_test.cc b/test/consistency_test.cc
index f0e2cb297..5e872e70a 100644
--- a/test/consistency_test.cc
+++ b/test/consistency_test.cc
@@ -65,14 +65,14 @@ class ConsistencyTestBase : public ::testing::Test {
     delete[] ssim_array_;
   }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

  protected:
   // Handle frames up to 640x480
   static const int kDataAlignment = 16;
   static const int kDataBufferSize = 640 * 480;

-  virtual void SetUp() {
+  void SetUp() override {
     source_stride_ = (width_ + 31) & ~31;
     reference_stride_ = width_ * 2;
     rnd_.Reset(ACMRandom::DeterministicSeed());
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index d56904869..ffd5c41c6 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -244,7 +244,7 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr,

   // Vertical pass (transposed intermediate -> dst).
   {
-    uint16_t *src_ptr = intermediate_buffer;
+    src_ptr = intermediate_buffer;
     const int dst_next_row_stride = dst_stride - output_width;
     unsigned int i, j;
     for (i = 0; i < output_height; ++i) {
@@ -361,7 +361,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
 #endif
   }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

   static void TearDownTestSuite() {
     vpx_free(input_ - 1);
@@ -403,7 +403,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
            i % kOuterBlockSize >= (BorderLeft() + Width()));
   }

-  virtual void SetUp() {
+  void SetUp() override {
     UUT_ = GET_PARAM(2);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ != 0) {
@@ -1423,6 +1423,36 @@ INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest,
                          ::testing::ValuesIn(kArrayConvolve_neon));
 #endif  // HAVE_NEON

+#if HAVE_NEON_DOTPROD
+const ConvolveFunctions convolve8_neon_dotprod(
+    vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_dotprod,
+    vpx_convolve8_avg_horiz_neon_dotprod, vpx_convolve8_vert_neon_dotprod,
+    vpx_convolve8_avg_vert_neon_dotprod, vpx_convolve8_neon_dotprod,
+    vpx_convolve8_avg_neon_dotprod, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c,
+    vpx_scaled_avg_2d_c, 0);
+
+const ConvolveParam kArrayConvolve_neon_dotprod[] = { ALL_SIZES(
+    convolve8_neon_dotprod) };
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_neon_dotprod));
+#endif  // HAVE_NEON_DOTPROD
+
+#if HAVE_NEON_I8MM
+const ConvolveFunctions convolve8_neon_i8mm(
+    vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_i8mm,
+    vpx_convolve8_avg_horiz_neon_i8mm, vpx_convolve8_vert_neon_i8mm,
+    vpx_convolve8_avg_vert_neon_i8mm, vpx_convolve8_neon_i8mm,
+    vpx_convolve8_avg_neon_i8mm, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c,
+    vpx_scaled_avg_2d_c, 0);
+
+const ConvolveParam kArrayConvolve_neon_i8mm[] = { ALL_SIZES(
+    convolve8_neon_i8mm) };
+INSTANTIATE_TEST_SUITE_P(NEON_I8MM, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_neon_i8mm));
+#endif  // HAVE_NEON_I8MM
+
 #if HAVE_DSPR2
 const ConvolveFunctions convolve8_dspr2(
     vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2, vpx_convolve8_horiz_dspr2,
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index a7623f09a..22f455296 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -26,9 +26,9 @@ class CpuSpeedTest
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR),
         tune_content_(VP9E_CONTENT_DEFAULT) {}
-  virtual ~CpuSpeedTest() {}
+  ~CpuSpeedTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     if (encoding_mode_ != ::libvpx_test::kRealTime) {
@@ -40,10 +40,10 @@ class CpuSpeedTest
     }
   }

-  virtual void BeginPassHook(unsigned int /*pass*/) { min_psnr_ = kMaxPSNR; }
+  void BeginPassHook(unsigned int /*pass*/) override { min_psnr_ = kMaxPSNR; }

-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_);
@@ -56,7 +56,7 @@ class CpuSpeedTest
     }
   }

-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.psnr.psnr[0] < min_psnr_) min_psnr_ = pkt->data.psnr.psnr[0];
   }
diff --git a/test/cq_test.cc b/test/cq_test.cc
index 292adb0d0..b74915a33 100644
--- a/test/cq_test.cc
+++ b/test/cq_test.cc
@@ -50,21 +50,21 @@ class CQTest : public ::libvpx_test::EncoderTest,
     init_flags_ = VPX_CODEC_USE_PSNR;
   }

-  virtual ~CQTest() {}
+  ~CQTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(libvpx_test::kTwoPassGood);
   }

-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     file_size_ = 0;
     psnr_ = 0.0;
     n_frames_ = 0;
   }

-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       if (cfg_.rc_end_usage == VPX_CQ) {
         encoder->Control(VP8E_SET_CQ_LEVEL, cq_level_);
@@ -73,12 +73,12 @@ class CQTest : public ::libvpx_test::EncoderTest,
     }
   }

-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     psnr_ += pow(10.0, pkt->data.psnr.psnr[0] / 10.0);
     n_frames_++;
   }

-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     file_size_ += pkt->data.frame.sz;
   }
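The new vpx_convolve8_*_neon_dotprod and _neon_i8mm kernels above are selected at run time according to which Armv8 extensions the CPU reports. A hand-rolled sketch of that dispatch idea -- the cap-bit and kernel names below are illustrative stand-ins, not the actual vpx_ports/RTCD API:

#include <stdio.h>

typedef int (*kernel_fn)(void);

static int kernel_c(void) { return 0; }
static int kernel_neon(void) { return 1; }
static int kernel_neon_dotprod(void) { return 2; }

enum { CAP_NEON = 1 << 0, CAP_NEON_DOTPROD = 1 << 1 }; /* illustrative */

static kernel_fn pick_kernel(int caps) {
  /* Prefer the most capable extension the CPU reports. */
  if (caps & CAP_NEON_DOTPROD) return kernel_neon_dotprod;
  if (caps & CAP_NEON) return kernel_neon;
  return kernel_c;
}

int main(void) {
  printf("%d\n", pick_kernel(CAP_NEON | CAP_NEON_DOTPROD)()); /* prints 2 */
  return 0;
}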
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index d4ef7ae13..8c4213ee1 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -27,6 +27,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/msvc.h"  // for round()
+#include "vpx_ports/vpx_timer.h"

 using libvpx_test::ACMRandom;

@@ -309,7 +310,7 @@ void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {

 class Trans16x16TestBase {
  public:
-  virtual ~Trans16x16TestBase() {}
+  virtual ~Trans16x16TestBase() = default;

  protected:
   virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
@@ -548,12 +549,50 @@ class Trans16x16TestBase {
     }
   }

+  void RunSpeedTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    int c_sum_time = 0;
+    int simd_sum_time = 0;
+
+    DECLARE_ALIGNED(32, int16_t, input_block[kNumCoeffs]);
+    DECLARE_ALIGNED(32, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(32, tran_low_t, output_block[kNumCoeffs]);
+
+    // Initialize a test block with input range [-mask_, mask_].
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+    }
+
+    vpx_usec_timer timer_c;
+    vpx_usec_timer_start(&timer_c);
+    for (int i = 0; i < count_test_block; ++i) {
+      vpx_fdct16x16_c(input_block, output_ref_block, pitch_);
+    }
+    vpx_usec_timer_mark(&timer_c);
+    c_sum_time += static_cast<int>(vpx_usec_timer_elapsed(&timer_c));
+
+    vpx_usec_timer timer_mod;
+    vpx_usec_timer_start(&timer_mod);
+    for (int i = 0; i < count_test_block; ++i) {
+      RunFwdTxfm(input_block, output_block, pitch_);
+    }
+
+    vpx_usec_timer_mark(&timer_mod);
+    simd_sum_time += static_cast<int>(vpx_usec_timer_elapsed(&timer_mod));
+
+    printf(
+        "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+        simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+  }
+
   void CompareInvReference(IdctFunc ref_txfm, int thresh) {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 10000;
     const int eob = 10;
     const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
-    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
     DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
     DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -604,6 +643,80 @@ class Trans16x16TestBase {
     }
   }

+  void RunInvTrans16x16SpeedTest(IdctFunc ref_txfm, int thresh) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    const int eob = 10;
+    const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
+    int64_t c_sum_time = 0;
+    int64_t simd_sum_time = 0;
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (j < eob) {
+        // Random values less than the threshold, either positive or negative
+        coeff[scan[j]] = rnd(thresh);
+      } else {
+        coeff[scan[j]] = 0;
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        dst[j] = 0;
+        ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        dst16[j] = 0;
+        ref16[j] = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+
+    if (bit_depth_ == VPX_BITS_8) {
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        ref_txfm(coeff, ref, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, dst, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+    } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    printf(
+        "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n",
+        c_sum_time, simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+  }
+
   int pitch_;
   int tx_type_;
   vpx_bit_depth_t bit_depth_;
@@ -615,9 +728,9 @@ class Trans16x16TestBase {
 class Trans16x16DCT : public Trans16x16TestBase,
                       public ::testing::TestWithParam<Dct16x16Param> {
  public:
-  virtual ~Trans16x16DCT() {}
+  ~Trans16x16DCT() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -636,13 +749,13 @@ class Trans16x16DCT : public Trans16x16TestBase,
     inv_txfm_ref = idct16x16_ref;
 #endif
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }

@@ -664,12 +777,14 @@ TEST_P(Trans16x16DCT, QuantCheck) {

 TEST_P(Trans16x16DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); }

+TEST_P(Trans16x16DCT, DISABLED_Speed) { RunSpeedTest(); }
+
 class Trans16x16HT : public Trans16x16TestBase,
                      public ::testing::TestWithParam<Ht16x16Param> {
  public:
-  virtual ~Trans16x16HT() {}
+  ~Trans16x16HT() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -688,13 +803,13 @@ class Trans16x16HT : public Trans16x16TestBase,
     inv_txfm_ref = iht16x16_ref;
 #endif
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride, tx_type_);
   }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride, tx_type_);
   }

@@ -714,13 +829,12 @@ TEST_P(Trans16x16HT, QuantCheck) {
   RunQuantCheck(429, 729);
 }

-#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 class InvTrans16x16DCT : public Trans16x16TestBase,
                          public ::testing::TestWithParam<Idct16x16Param> {
  public:
-  virtual ~InvTrans16x16DCT() {}
+  ~InvTrans16x16DCT() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     ref_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     thresh_ = GET_PARAM(2);
@@ -728,11 +842,12 @@ class InvTrans16x16DCT : public Trans16x16TestBase,
     pitch_ = 16;
     mask_ = (1 << bit_depth_) - 1;
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

  protected:
-  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {}
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/,
+                  int /*stride*/) override {}
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }

@@ -745,7 +860,10 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans16x16DCT);
 TEST_P(InvTrans16x16DCT, CompareReference) {
   CompareInvReference(ref_txfm_, thresh_);
 }
-#endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+TEST_P(InvTrans16x16DCT, DISABLED_Speed) {
+  RunInvTrans16x16SpeedTest(ref_txfm_, thresh_);
+}

 using std::make_tuple;
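The new DISABLED_Speed tests above all follow one timing pattern: run the C reference and the SIMD path the same number of times with vpx_usec_timer and report the ratio. A minimal C skeleton of that harness, using the real vpx_ports timer API (do_work is a placeholder for the function under test):

#include <stdio.h>
#include "vpx_ports/vpx_timer.h"

static void do_work(void) {} /* placeholder for the kernel under test */

int main(void) {
  struct vpx_usec_timer timer;
  int i;
  vpx_usec_timer_start(&timer);
  for (i = 0; i < 10000; ++i) do_work();
  vpx_usec_timer_mark(&timer);
  /* Timing both a reference and an optimized path this way and dividing
   * the two elapsed values gives the "Gain" figure the tests print. */
  printf("elapsed us: %d\n", (int)vpx_usec_timer_elapsed(&timer));
  return 0;
}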
@@ -787,6 +905,12 @@ INSTANTIATE_TEST_SUITE_P(
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(C, InvTrans16x16DCT,
+                         ::testing::Values(make_tuple(&vpx_idct16x16_256_add_c,
+                                                      &vpx_idct16x16_256_add_c,
+                                                      6225, VPX_BITS_8)));
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH

 #if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -821,8 +945,25 @@ INSTANTIATE_TEST_SUITE_P(
                                  2, VPX_BITS_8),
                       make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2,
                                  3, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(SSE2, InvTrans16x16DCT,
+                         ::testing::Values(make_tuple(
+                             &vpx_idct16x16_256_add_c,
+                             &vpx_idct16x16_256_add_sse2, 6225, VPX_BITS_8)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

+#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, Trans16x16DCT,
+    ::testing::Values(make_tuple(&vpx_fdct16x16_avx2,
+                                 &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(AVX2, InvTrans16x16DCT,
+                         ::testing::Values(make_tuple(
+                             &vpx_idct16x16_256_add_c,
+                             &vpx_idct16x16_256_add_avx2, 6225, VPX_BITS_8)));
+#endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_SUITE_P(
     SSE2, Trans16x16DCT,
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 91bb8e01e..6233b17a4 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -24,10 +24,12 @@
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/msvc.h"  // for round()
+#include "vpx_ports/vpx_timer.h"

 using libvpx_test::ACMRandom;

@@ -71,6 +73,9 @@ typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
 typedef std::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
     Trans32x32Param;

+typedef std::tuple<InvTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t, int, int>
+    InvTrans32x32Param;
+
 #if CONFIG_VP9_HIGHBITDEPTH
 void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
   vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
@@ -84,8 +89,8 @@ void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
 class Trans32x32Test : public AbstractBench,
                        public ::testing::TestWithParam<Trans32x32Param> {
  public:
-  virtual ~Trans32x32Test() {}
-  virtual void SetUp() {
+  ~Trans32x32Test() override = default;
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     version_ = GET_PARAM(2);  // 0: high precision forward transform
@@ -94,7 +99,7 @@ class Trans32x32Test : public AbstractBench,
     mask_ = (1 << bit_depth_) - 1;
   }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

  protected:
   int version_;
@@ -105,7 +110,7 @@ class Trans32x32Test : public AbstractBench,
   int16_t *bench_in_;
   tran_low_t *bench_out_;

-  virtual void Run();
+  void Run() override;
 };

 void Trans32x32Test::Run() { fwd_txfm_(bench_in_, bench_out_, 32); }
@@ -314,6 +319,174 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
   }
 }

+class InvTrans32x32Test : public ::testing::TestWithParam<InvTrans32x32Param> {
+ public:
+  ~InvTrans32x32Test() override = default;
+  void SetUp() override {
+    ref_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    version_ = GET_PARAM(2);  // 0: high precision forward transform
+                              // 1: low precision version for rd loop
+    bit_depth_ = GET_PARAM(3);
+    eob_ = GET_PARAM(4);
+    thresh_ = GET_PARAM(4);
+    mask_ = (1 << bit_depth_) - 1;
+    pitch_ = 32;
+  }
+
+  void TearDown() override { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunRefTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    ref_txfm_(out, dst, stride);
+  }
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);
+  }
+  int version_;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
+  int eob_;
+  int thresh_;
+
+  InvTxfmFunc ref_txfm_;
+  InvTxfmFunc inv_txfm_;
+  int pitch_;
+
+  void RunInvTrans32x32SpeedTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    int64_t c_sum_time = 0;
+    int64_t simd_sum_time = 0;
+    const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan;
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (j < eob_) {
+        // Random values less than the threshold, either positive or negative
+        coeff[scan[j]] = rnd(thresh_);
+      } else {
+        coeff[scan[j]] = 0;
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        dst[j] = 0;
+        ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        dst16[j] = 0;
+        ref16[j] = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+
+    if (bit_depth_ == VPX_BITS_8) {
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunRefTxfm(coeff, ref, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, dst, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+    } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    printf(
+        "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n",
+        c_sum_time, simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+  }
+
+  void CompareInvReference32x32() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    const int eob = 31;
+    const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan;
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int i = 0; i < count_test_block; ++i) {
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (j < eob) {
+          coeff[scan[j]] = rnd.Rand8Extremes();
+        } else {
+          coeff[scan[j]] = 0;
+        }
+        if (bit_depth_ == VPX_BITS_8) {
+          dst[j] = 0;
+          ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          dst16[j] = 0;
+          ref16[j] = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        RunRefTxfm(coeff, ref, pitch_);
+        RunInvTxfm(coeff, dst, pitch_);
+      } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+        RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
+#else
+        const uint32_t diff = dst[j] - ref[j];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t error = diff * diff;
+        EXPECT_EQ(0u, error) << "Error: 32x32 IDCT Comparison has error "
+                             << error << " at index " << j;
+      }
+    }
+  }
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans32x32Test);
+
+TEST_P(InvTrans32x32Test, DISABLED_Speed) { RunInvTrans32x32SpeedTest(); }
+TEST_P(InvTrans32x32Test, CompareReference) { CompareInvReference32x32(); }
+
 using std::make_tuple;

 #if CONFIG_VP9_HIGHBITDEPTH
@@ -334,6 +507,14 @@ INSTANTIATE_TEST_SUITE_P(
                                  VPX_BITS_8),
                       make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c,
                                  1, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    C, InvTrans32x32Test,
+    ::testing::Values(
+        (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_c, 0,
+                    VPX_BITS_8, 32, 6225)),
+        make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_c, 0,
+                   VPX_BITS_8, 16, 6255)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH

 #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
@@ -352,6 +533,14 @@ INSTANTIATE_TEST_SUITE_P(
                                  &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
                       make_tuple(&vpx_fdct32x32_rd_sse2,
                                  &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, InvTrans32x32Test,
+    ::testing::Values(
+        (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_sse2, 0,
+                    VPX_BITS_8, 32, 6225)),
+        make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_sse2, 0,
+                   VPX_BITS_8, 16, 6225)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -377,6 +566,14 @@ INSTANTIATE_TEST_SUITE_P(
                                  &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
                       make_tuple(&vpx_fdct32x32_rd_avx2,
                                  &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, InvTrans32x32Test,
+    ::testing::Values(
+        (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_avx2, 0,
+                    VPX_BITS_8, 32, 6225)),
+        make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_avx2, 0,
+                   VPX_BITS_8, 16, 6225)));
 #endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

 #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
diff --git a/test/dct_partial_test.cc b/test/dct_partial_test.cc
index e57fa0f48..ec6f543f7 100644
--- a/test/dct_partial_test.cc
+++ b/test/dct_partial_test.cc
@@ -67,7 +67,7 @@ class PartialFdctTest : public ::testing::TestWithParam<PartialFdctParam> {
     bit_depth_ = GET_PARAM(2);
   }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

  protected:
   void RunTest() {
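The InvTrans32x32Test instantiations above are parameterized per eob bucket because VP9 uses reduced inverse transforms when only low-frequency coefficients are present, which is what the _135 and _1024 variant names refer to. A sketch of that selection idea -- the thresholds below are inferred from the variant names and are an assumption, not the decoder's actual dispatch code:

#include <stdio.h>

static const char *idct32x32_variant(int eob) {
  if (eob <= 34) return "idct32x32_34_add";   /* low-frequency corner only */
  if (eob <= 135) return "idct32x32_135_add"; /* partial transform */
  return "idct32x32_1024_add";                /* full 32x32 transform */
}

int main(void) {
  printf("eob 100 -> %s\n", idct32x32_variant(100)); /* idct32x32_135_add */
  printf("eob 500 -> %s\n", idct32x32_variant(500)); /* idct32x32_1024_add */
  return 0;
}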
+134,7 @@ void fwht_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size, class TransTestBase : public ::testing::TestWithParam<DctParam> { public: - virtual void SetUp() { + void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); const int idx = GET_PARAM(0); const FuncInfo *func_info = &(GET_PARAM(1)[idx]); @@ -166,7 +166,7 @@ class TransTestBase : public ::testing::TestWithParam<DctParam> { ASSERT_NE(dst_, nullptr); } - virtual void TearDown() { + void TearDown() override { vpx_free(src_); src_ = nullptr; vpx_free(dst_); @@ -358,14 +358,6 @@ class TransTestBase : public ::testing::TestWithParam<DctParam> { ASSERT_TRUE(in.Init()); Buffer<tran_low_t> coeff = Buffer<tran_low_t>(size_, size_, 0, 16); ASSERT_TRUE(coeff.Init()); - Buffer<uint8_t> dst = Buffer<uint8_t>(size_, size_, 0, 16); - ASSERT_TRUE(dst.Init()); - Buffer<uint8_t> src = Buffer<uint8_t>(size_, size_, 0); - ASSERT_TRUE(src.Init()); - Buffer<uint16_t> dst16 = Buffer<uint16_t>(size_, size_, 0, 16); - ASSERT_TRUE(dst16.Init()); - Buffer<uint16_t> src16 = Buffer<uint16_t>(size_, size_, 0); - ASSERT_TRUE(src16.Init()); for (int i = 0; i < count_test_block; ++i) { InitMem(); @@ -671,8 +663,12 @@ static const FuncInfo ht_neon_func_info[] = { 4, 2 }, { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>, 8, 2 }, + { &vp9_highbd_fht8x8_neon, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>, + 8, 2 }, { &vp9_highbd_fht16x16_c, &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_neon>, 16, 2 }, + { &vp9_highbd_fht16x16_neon, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_neon>, 16, 2 }, #endif { &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1 }, { &vp9_fht4x4_neon, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1 }, diff --git a/test/decode_api_test.cc b/test/decode_api_test.cc index 9e82ace1b..44e439772 100644 --- a/test/decode_api_test.cc +++ b/test/decode_api_test.cc @@ -20,7 +20,7 @@ namespace { #define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0])) TEST(DecodeAPI, InvalidParams) { - static const vpx_codec_iface_t *kCodecs[] = { + static vpx_codec_iface_t *kCodecs[] = { #if CONFIG_VP8_DECODER &vpx_codec_vp8_dx_algo, #endif @@ -120,7 +120,7 @@ void TestVp9Controls(vpx_codec_ctx_t *dec) { } TEST(DecodeAPI, Vp9InvalidDecode) { - const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; const char filename[] = "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"; libvpx_test::IVFVideoSource video(filename); @@ -147,7 +147,7 @@ TEST(DecodeAPI, Vp9InvalidDecode) { void TestPeekInfo(const uint8_t *const data, uint32_t data_sz, uint32_t peek_size) { - const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get // to decoder_peek_si_internal on frames of size < 8. 
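  // A hedged illustration of that path (decoder setup shown for context; the
  // specific failure expectation is an assumption for this sketch, not taken
  // from the test):
  //   vpx_codec_ctx_t dec;
  //   vpx_codec_dec_init(&dec, codec, nullptr, 0);
  //   EXPECT_NE(vpx_codec_decode(&dec, data, /*data_sz=*/7, nullptr, 0),
  //             VPX_CODEC_OK);
  //   vpx_codec_destroy(&dec);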
if (data_sz >= 8) { diff --git a/test/decode_corrupted.cc b/test/decode_corrupted.cc index 31e1da69c..58773d7b8 100644 --- a/test/decode_corrupted.cc +++ b/test/decode_corrupted.cc @@ -28,9 +28,9 @@ class DecodeCorruptedFrameTest DecodeCorruptedFrameTest() : EncoderTest(GET_PARAM(0)) {} protected: - virtual ~DecodeCorruptedFrameTest() {} + ~DecodeCorruptedFrameTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); cfg_.g_lag_in_frames = 0; @@ -44,16 +44,16 @@ class DecodeCorruptedFrameTest dec_cfg_.threads = 1; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, 7); } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) {} + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override {} - virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( - const vpx_codec_cx_pkt_t *pkt) { + const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( + const vpx_codec_cx_pkt_t *pkt) override { // Don't edit frame packet on key frame. if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) return pkt; if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt; @@ -66,9 +66,9 @@ class DecodeCorruptedFrameTest return &modified_pkt_; } - virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder *decoder) { + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { EXPECT_NE(res_dec, VPX_CODEC_MEM_ERROR) << decoder->DecodeError(); return VPX_CODEC_MEM_ERROR != res_dec; } diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc index e07a66744..383fd2d89 100644 --- a/test/decode_perf_test.cc +++ b/test/decode_perf_test.cc @@ -116,11 +116,11 @@ class VP9NewEncodeDecodePerfTest protected: VP9NewEncodeDecodePerfTest() : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0), - outfile_(0), out_frames_(0) {} + outfile_(nullptr), out_frames_(0) {} - virtual ~VP9NewEncodeDecodePerfTest() {} + ~VP9NewEncodeDecodePerfTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -137,8 +137,8 @@ class VP9NewEncodeDecodePerfTest cfg_.rc_end_usage = VPX_VBR; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, speed_); encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); @@ -146,14 +146,14 @@ class VP9NewEncodeDecodePerfTest } } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { const std::string data_path = getenv("LIBVPX_TEST_DATA_PATH"); const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile; outfile_ = fopen(path_to_source.c_str(), "wb"); ASSERT_NE(outfile_, nullptr); } - virtual void EndPassHook() { + void EndPassHook() override { if (outfile_ != nullptr) { if (!fseek(outfile_, 0, SEEK_SET)) { ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_); @@ -163,7 +163,7 @@ class VP9NewEncodeDecodePerfTest } } - virtual void 
FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ++out_frames_; // Write initial file header if first frame. @@ -177,7 +177,7 @@ class VP9NewEncodeDecodePerfTest pkt->data.frame.sz); } - virtual bool DoDecode() const { return false; } + bool DoDecode() const override { return false; } void set_speed(unsigned int speed) { speed_ = speed; } diff --git a/test/decode_svc_test.cc b/test/decode_svc_test.cc index ec9935da7..7098e7b27 100644 --- a/test/decode_svc_test.cc +++ b/test/decode_svc_test.cc @@ -25,17 +25,16 @@ class DecodeSvcTest : public ::libvpx_test::DecoderTest, public ::libvpx_test::CodecTestWithParam<const char *> { protected: DecodeSvcTest() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)) {} - virtual ~DecodeSvcTest() {} + ~DecodeSvcTest() override = default; - virtual void PreDecodeFrameHook( - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { if (video.frame_number() == 0) decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, spatial_layer_); } - virtual void DecompressedFrameHook(const vpx_image_t &img, - const unsigned int frame_number) { + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { ASSERT_EQ(img.d_w, width_); ASSERT_EQ(img.d_h, height_); total_frames_ = frame_number; diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index e0e793b15..ee7d8d27c 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -11,17 +11,24 @@ #include <climits> #include <cstring> #include <initializer_list> +#include <new> #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/video_source.h" #include "./vpx_config.h" -#include "test/video_source.h" #include "vpx/vp8cx.h" +#include "vpx/vpx_codec.h" #include "vpx/vpx_encoder.h" +#include "vpx/vpx_image.h" +#include "vpx/vpx_tpl.h" namespace { -const vpx_codec_iface_t *kCodecIfaces[] = { +vpx_codec_iface_t *kCodecIfaces[] = { #if CONFIG_VP8_ENCODER &vpx_codec_vp8_cx_algo, #endif @@ -30,7 +37,7 @@ const vpx_codec_iface_t *kCodecIfaces[] = { #endif }; -bool IsVP9(const vpx_codec_iface_t *iface) { +bool IsVP9(vpx_codec_iface_t *iface) { static const char kVP9Name[] = "WebM Project VP9"; return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) == 0; @@ -118,6 +125,62 @@ TEST(EncodeAPI, ImageSizeSetting) { vpx_codec_destroy(&enc); } + +// Verifies the fix for a float-cast-overflow in vp8_change_config(). +// +// Causes cpi->framerate to become the largest possible value (10,000,000) in +// VP8 by setting cfg.g_timebase to 1/10000000 and passing a duration of 1 to +// vpx_codec_encode(). +TEST(EncodeAPI, HugeFramerateVp8) { + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + cfg.g_w = 271; + cfg.g_h = 1080; + cfg.g_timebase.num = 1; + // Largest value (VP8's TICKS_PER_SEC) such that frame duration is nonzero (1 + // tick). + cfg.g_timebase.den = 10000000; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = VPX_CBR; + + vpx_codec_ctx_t enc; + // Before we encode the first frame, cpi->framerate is set to a guess (the + // reciprocal of cfg.g_timebase). 
If this guess doesn't seem reasonable + // (> 180), cpi->framerate is set to 30. + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -12), VPX_CODEC_OK); + + vpx_image_t *const image = + vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1); + ASSERT_NE(image, nullptr); + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + + // Encode a frame. + // Up to this point cpi->framerate is 30. Now pass a duration of only 1. This + // causes cpi->framerate to become 10,000,000. + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_REALTIME), + VPX_CODEC_OK); + + // Change to the same config. Since cpi->framerate is now huge, when it is + // used to calculate raw_target_rate (bit rate of uncompressed frames), the + // result is likely to overflow an unsigned int. + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK); + + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} #endif // Set up 2 spatial streams with 2 temporal layers per stream, and generate @@ -196,7 +259,7 @@ TEST(EncodeAPI, MultiResEncode) { TEST(EncodeAPI, SetRoi) { static struct { - const vpx_codec_iface_t *iface; + vpx_codec_iface_t *iface; int ctrl_id; } kCodecs[] = { #if CONFIG_VP8_ENCODER @@ -302,7 +365,7 @@ TEST(EncodeAPI, SetRoi) { } } -void InitCodec(const vpx_codec_iface_t &iface, int width, int height, +void InitCodec(vpx_codec_iface_t &iface, int width, int height, vpx_codec_ctx_t *enc, vpx_codec_enc_cfg_t *cfg) { cfg->g_w = width; cfg->g_h = height; @@ -392,7 +455,7 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { EXPECT_EQ(vpx_codec_enc_config_set(&enc.ctx, &cfg), VPX_CODEC_OK) << vpx_codec_error_detail(&enc.ctx); - cfg.g_w = 16; + cfg.g_w = 1000; cfg.g_h = 720; for (const auto threads : { 1, 4, 8, 6, 2, 1 }) { @@ -404,4 +467,451 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { } } +#if CONFIG_VP9_ENCODER +// Frame size needed to trigger the overflow exceeds the max buffer allowed on +// 32-bit systems defined by VPX_MAX_ALLOCABLE_MEMORY +#if VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 +TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { + constexpr int kWidth = 12383; + constexpr int kHeight = 8192; + constexpr auto *iface = &vpx_codec_vp9_cx_algo; + SCOPED_TRACE(vpx_codec_iface_name(iface)); + vpx_codec_enc_cfg_t cfg = {}; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + // The following setting will cause avg_frame_bandwidth in rate control to be + // larger than INT_MAX + cfg.rc_target_bitrate = INT_MAX; + // Framerate 0.1 (equivalent to timebase 10) is the smallest framerate allowed + // by libvpx + cfg.g_timebase.den = 1; + cfg.g_timebase.num = 10; + EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)) + << "target bitrate: " << cfg.rc_target_bitrate << " framerate: " + << static_cast<double>(cfg.g_timebase.den) / cfg.g_timebase.num; +} +#endif // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 + +vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) { + vpx_image_t *image = + 
vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, width, height, 1); + if (!image) return image; + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + + return image; +} + +// Emulates the WebCodecs VideoEncoder interface. +class VP9Encoder { + public: + VP9Encoder(int speed) : speed_(speed) {} + ~VP9Encoder(); + + void Configure(unsigned int threads, unsigned int width, unsigned int height, + vpx_rc_mode end_usage, unsigned long deadline); + void Encode(bool key_frame); + + private: + const int speed_; + bool initialized_ = false; + vpx_codec_enc_cfg_t cfg_; + vpx_codec_ctx_t enc_; + int frame_index_ = 0; + unsigned long deadline_ = 0; +}; + +VP9Encoder::~VP9Encoder() { + if (initialized_) { + EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK); + } +} + +void VP9Encoder::Configure(unsigned int threads, unsigned int width, + unsigned int height, vpx_rc_mode end_usage, + unsigned long deadline) { + deadline_ = deadline; + + if (!initialized_) { + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0), + VPX_CODEC_OK); + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.g_timebase.num = 1; + cfg_.g_timebase.den = 1000 * 1000; // microseconds + cfg_.g_pass = VPX_RC_ONE_PASS; + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = end_usage; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 58; + ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK); + initialized_ = true; + return; + } + + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.rc_end_usage = end_usage; + ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc_); +} + +void VP9Encoder::Encode(bool key_frame) { + const vpx_codec_cx_pkt_t *pkt; + vpx_image_t *image = CreateImage(cfg_.g_w, cfg_.g_h); + ASSERT_NE(image, nullptr); + const vpx_enc_frame_flags_t frame_flags = key_frame ? VPX_EFLAG_FORCE_KF : 0; + ASSERT_EQ( + vpx_codec_encode(&enc_, image, frame_index_, 1, frame_flags, deadline_), + VPX_CODEC_OK); + frame_index_++; + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + } + vpx_img_free(image); +} + +// This is a test case from clusterfuzz. +TEST(EncodeAPI, PrevMiCheckNullptr) { + VP9Encoder encoder(0); + encoder.Configure(0, 1554, 644, VPX_VBR, VPX_DL_REALTIME); + + // First step: encode, without forcing KF. + encoder.Encode(false); + // Second step: change config + encoder.Configure(0, 1131, 644, VPX_CBR, VPX_DL_GOOD_QUALITY); + // Third step: encode, without forcing KF + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/310477034. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, MultipleChangeConfigResize) { + VP9Encoder encoder(3); + + // Set initial config. + encoder.Configure(3, 41, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config. 
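+  // (Reading aid for the Configure() calls in these regression tests: per the
+  // VP9Encoder helper above, the arguments are threads, width, height,
+  // rc end-usage, and deadline, in that order.)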
+ encoder.Configure(16, 31, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Change config again. + encoder.Configure(0, 17, 1, VPX_CBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); + + // Encode 3rd frame with same config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/310663186. +// Encode set of frames while varying the deadline on the fly from +// good to realtime to best and back to realtime. +TEST(EncodeAPI, DynamicDeadlineChange) { + // Use realtime speed: 5 to 9. + VP9Encoder encoder(5); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(true); + + // Encode 2nd frame, delta frame. + encoder.Encode(false); + + // Change config: change deadline to REALTIME. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set key frame. + encoder.Encode(true); + + // Encode 4th frame with same config, delta frame. + encoder.Encode(false); + + // Encode 5th frame with same config, key frame. + encoder.Encode(true); + + // Change config: change deadline to BEST. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_BEST_QUALITY); + + // Encode 6th frame with new config, set delta frame. + encoder.Encode(false); + + // Change config: change deadline to REALTIME. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 7th frame with new config, set delta frame. + encoder.Encode(false); + + // Encode 8th frame with new config, set key frame. + encoder.Encode(true); + + // Encode 9th frame with new config, set delta frame. + encoder.Encode(false); +} + +TEST(EncodeAPI, Buganizer310340241) { + VP9Encoder encoder(-6); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(true); + + // Encode 2nd frame, delta frame. + encoder.Encode(false); + + // Change config: change deadline to REALTIME. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set key frame. + encoder.Encode(true); +} + +// This is a test case from clusterfuzz: based on b/312517065. +TEST(EncodeAPI, Buganizer312517065) { + VP9Encoder encoder(4); + encoder.Configure(0, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Configure(10, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY); + encoder.Encode(false); + encoder.Configure(6, 327, 269, VPX_VBR, VPX_DL_GOOD_QUALITY); + encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/311489136. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer311489136) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode first frame. + encoder.Encode(true); + + // Change config. + encoder.Configure(3, 1678, 202, VPX_CBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); + + // Change config again. + encoder.Configure(8, 1037, 476, VPX_CBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set delta frame. + encoder.Encode(false); + + // Change config again. + encoder.Configure(0, 580, 620, VPX_CBR, VPX_DL_GOOD_QUALITY); + + // Encode 4th frame with same config, set delta frame. 
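+  // (Per the VP9Encoder helper above, Encode(false) leaves frame_flags at 0,
+  // i.e. it requests an ordinary inter frame rather than passing
+  // VPX_EFLAG_FORCE_KF.)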
+ encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/312656387. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer312656387) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(16, 1, 1024, VPX_CBR, VPX_DL_REALTIME); + + // Change config. + encoder.Configure(15, 1, 1024, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config again. + encoder.Configure(14, 1, 595, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config. + encoder.Encode(true); + + // Change config again. + encoder.Configure(2, 1, 1024, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 3rd frame with new config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/310329177. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer310329177) { + VP9Encoder encoder(6); + + // Set initial config. + encoder.Configure(10, 41, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config. + encoder.Configure(16, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/311394513. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer311394513) { + VP9Encoder encoder(-7); + + // Set initial config. + encoder.Configure(0, 5, 9, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(5, 2, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config. + encoder.Encode(true); +} + +TEST(EncodeAPI, Buganizer311985118) { + VP9Encoder encoder(0); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(false); + + // Change config: change threads and width. + encoder.Configure(0, 1574, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Change config: change threads, width and height. + encoder.Configure(16, 837, 432, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/314857577. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer314857577) { + VP9Encoder encoder(4); + + // Set initial config. + encoder.Configure(12, 1060, 437, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(16, 1060, 1, VPX_CBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config. + encoder.Encode(false); + + // Encode 3rd frame with new config. + encoder.Encode(true); + + // Change config. + encoder.Configure(15, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 4th frame with new config. + encoder.Encode(true); + + // Encode 5th frame with new config. + encoder.Encode(false); + + // Change config. + encoder.Configure(5, 327, 269, VPX_VBR, VPX_DL_REALTIME); + + // Change config. + encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + + // Encode 6th frame with new config. + encoder.Encode(false); + + // Encode 7th frame with new config. + encoder.Encode(false); + + // Change config. 
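+  // (This restores the 1060x437 VBR/realtime shape the test started with,
+  // now with 4 threads.)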
+ encoder.Configure(4, 1060, 437, VPX_VBR, VPX_DL_REALTIME); + + // Encode 8th frame with new config. + encoder.Encode(false); +} + +TEST(EncodeAPI, Buganizer312875957PredBufferStride) { + VP9Encoder encoder(-1); + + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Encode(false); + encoder.Configure(0, 456, 486, VPX_VBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Configure(0, 1678, 620, VPX_CBR, 1000000); + encoder.Encode(false); + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/311294795 +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer311294795) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(16, 632, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config + encoder.Encode(true); + + // Change config. + encoder.Configure(16, 1678, 342, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 3rd frame with new config. + encoder.Encode(false); + + // Change config. + encoder.Configure(0, 1574, 618, VPX_VBR, VPX_DL_REALTIME); + // Encode more frames with new config. + encoder.Encode(false); + encoder.Encode(false); +} +#endif // CONFIG_VP9_ENCODER + } // namespace diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc index 142a55952..171ff8eec 100644 --- a/test/encode_perf_test.cc +++ b/test/encode_perf_test.cc @@ -61,9 +61,9 @@ class VP9EncodePerfTest : EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0), encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {} - virtual ~VP9EncodePerfTest() {} + ~VP9EncodePerfTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -82,8 +82,8 @@ class VP9EncodePerfTest cfg_.g_threads = threads_; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { const int log2_tile_columns = 3; encoder->Control(VP8E_SET_CPUUSED, speed_); @@ -93,19 +93,19 @@ class VP9EncodePerfTest } } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { min_psnr_ = kMaxPsnr; nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.psnr.psnr[0] < min_psnr_) { min_psnr_ = pkt->data.psnr.psnr[0]; } } // for performance reasons don't decode - virtual bool DoDecode() const { return false; } + bool DoDecode() const override { return false; } double min_psnr() const { return min_psnr_; } diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index b57df8529..c7974894c 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -19,7 +19,7 @@ #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER #include "vpx/vp8cx.h" #endif -#include "vpx/vpx_encoder.h" +#include "vpx/vpx_tpl.h" namespace libvpx_test { @@ -153,6 +153,11 @@ class Encoder { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } + + void Control(int ctrl_id, VpxTplGopStats *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } 
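+  // A hedged usage sketch for this overload (it assumes the
+  // VP9E_GET_TPL_STATS control that accompanies vpx/vpx_tpl.h; the names are
+  // from that header as understood here, not from this diff):
+  //   VpxTplGopStats tpl_stats;
+  //   encoder->Control(VP9E_GET_TPL_STATS, &tpl_stats);
+  //   // ... inspect the per-frame TPL stats for the GOP ...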
#endif // CONFIG_VP9_ENCODER #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER @@ -259,7 +264,7 @@ class EncoderTest { const CodecFactory *codec_; // Hook to determine whether to decode frame after encoding - virtual bool DoDecode() const { return 1; } + virtual bool DoDecode() const { return true; } // Hook to handle encode/decode mismatch virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2); diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc index 45138f14b..6b019b2bf 100644 --- a/test/error_resilience_test.cc +++ b/test/error_resilience_test.cc @@ -30,7 +30,7 @@ class ErrorResilienceTestLarge Reset(); } - virtual ~ErrorResilienceTestLarge() {} + ~ErrorResilienceTestLarge() override = default; void Reset() { error_nframes_ = 0; @@ -38,19 +38,19 @@ class ErrorResilienceTestLarge pattern_switch_ = 0; } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { psnr_ = 0.0; nframes_ = 0; mismatch_psnr_ = 0.0; mismatch_nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { psnr_ += pkt->data.psnr.psnr[0]; nframes_++; } @@ -90,7 +90,7 @@ class ErrorResilienceTestLarge return frame_flags; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video) override { frame_flags_ &= ~(VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF); // For temporal layer case. @@ -129,21 +129,21 @@ class ErrorResilienceTestLarge return 0.0; } - virtual bool DoDecode() const { + bool DoDecode() const override { if (error_nframes_ > 0 && (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) { for (unsigned int i = 0; i < error_nframes_; ++i) { if (error_frames_[i] == nframes_ - 1) { std::cout << " Skipping decoding frame: " << error_frames_[i] << "\n"; - return 0; + return false; } } } - return 1; + return true; } - virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { double mismatch_psnr = compute_psnr(img1, img2); mismatch_psnr_ += mismatch_psnr; ++mismatch_nframes_; @@ -381,7 +381,7 @@ class ErrorResilienceTestLargeCodecControls Reset(); } - virtual ~ErrorResilienceTestLargeCodecControls() {} + ~ErrorResilienceTestLargeCodecControls() override = default; void Reset() { last_pts_ = 0; @@ -393,7 +393,7 @@ class ErrorResilienceTestLargeCodecControls duration_ = 0.0; } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); } @@ -460,8 +460,8 @@ class ErrorResilienceTestLargeCodecControls return layer_id; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (cfg_.ts_number_layers > 1) { int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers); int frame_flags = SetFrameFlags(video->frame(), cfg_.ts_number_layers); @@ -476,7 +476,7 @@ class ErrorResilienceTestLargeCodecControls } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Time since last timestamp = duration. 
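    // (e.g. a packet at pts 12 arriving after last_pts_ == 10 yields a
    // duration of 2 timebase units.)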
vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; if (duration > 1) { @@ -496,7 +496,7 @@ class ErrorResilienceTestLargeCodecControls ++tot_frame_number_; } - virtual void EndPassHook() { + void EndPassHook() override { duration_ = (last_pts_ + 1) * timebase_; if (cfg_.ts_number_layers > 1) { for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers); diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc index 3bd4a1c47..7b9a836fb 100644 --- a/test/external_frame_buffer_test.cc +++ b/test/external_frame_buffer_test.cc @@ -210,13 +210,12 @@ class ExternalFrameBufferMD5Test : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)), md5_file_(nullptr), num_buffers_(0) {} - virtual ~ExternalFrameBufferMD5Test() { + ~ExternalFrameBufferMD5Test() override { if (md5_file_ != nullptr) fclose(md5_file_); } - virtual void PreDecodeFrameHook( - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { if (num_buffers_ > 0 && video.frame_number() == 0) { // Have libvpx use frame buffers we create. ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_)); @@ -232,8 +231,8 @@ class ExternalFrameBufferMD5Test << "Md5 file open failed. Filename: " << md5_file_name_; } - virtual void DecompressedFrameHook(const vpx_image_t &img, - const unsigned int frame_number) { + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { ASSERT_NE(md5_file_, nullptr); char expected_md5[33]; char junk[128]; @@ -289,7 +288,7 @@ class ExternalFrameBufferTest : public ::testing::Test { ExternalFrameBufferTest() : video_(nullptr), decoder_(nullptr), num_buffers_(0) {} - virtual void SetUp() { + void SetUp() override { video_ = new libvpx_test::WebMVideoSource(kVP9TestFile); ASSERT_NE(video_, nullptr); video_->Init(); @@ -300,7 +299,7 @@ class ExternalFrameBufferTest : public ::testing::Test { ASSERT_NE(decoder_, nullptr); } - virtual void TearDown() { + void TearDown() override { delete decoder_; decoder_ = nullptr; delete video_; @@ -355,7 +354,7 @@ class ExternalFrameBufferTest : public ::testing::Test { class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest { protected: - virtual void SetUp() { + void SetUp() override { video_ = new libvpx_test::WebMVideoSource(kVP9NonRefTestFile); ASSERT_NE(video_, nullptr); video_->Init(); diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 83d1ff142..3cdf909d4 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -132,9 +132,18 @@ void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// produces invalid code in RunExtremalCheck() and RunInvAccuracyCheck(). +// See: +// https://developercommunity.visualstudio.com/t/1770-preview-1:-Misoptimization-for-AR/10369786 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. 
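+// The pragma pair scopes the workaround: optimization is turned off here and
+// restored by the matching #pragma optimize("", on) after the class body, so
+// only this test base is built unoptimized.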
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", off) +#endif class FwdTrans8x8TestBase { public: - virtual ~FwdTrans8x8TestBase() {} + virtual ~FwdTrans8x8TestBase() = default; protected: virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0; @@ -170,7 +179,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < 64; ++j) { const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); const int max_diff = kSignBiasMaxDiff255; - EXPECT_LT(diff, max_diff << (bit_depth_ - 8)) + ASSERT_LT(diff, max_diff << (bit_depth_ - 8)) << "Error: 8x8 FDCT/FHT has a sign bias > " << 1. * max_diff / count_test_block * 100 << "%" << " for input range [-255, 255] at index " << j @@ -201,7 +210,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < 64; ++j) { const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); const int max_diff = kSignBiasMaxDiff15; - EXPECT_LT(diff, max_diff << (bit_depth_ - 8)) + ASSERT_LT(diff, max_diff << (bit_depth_ - 8)) << "Error: 8x8 FDCT/FHT has a sign bias > " << 1. * max_diff / count_test_block * 100 << "%" << " for input range [-15, 15] at index " << j @@ -275,11 +284,11 @@ class FwdTrans8x8TestBase { } } - EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error) + ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error) << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual" << " roundtrip error > 1"; - EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) + ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip " << "error > 1/5 per block"; } @@ -360,17 +369,17 @@ class FwdTrans8x8TestBase { total_coeff_error += abs(coeff_diff); } - EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error) + ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error) << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has" - << "an individual roundtrip error > 1"; + << " an individual roundtrip error > 1"; - EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) + ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average" << " roundtrip error > 1/5 per block"; - EXPECT_EQ(0, total_coeff_error) + ASSERT_EQ(0, total_coeff_error) << "Error: Extremal 8x8 FDCT/FHT has" - << "overflow issues in the intermediate steps > 1"; + << " overflow issues in the intermediate steps > 1"; } } @@ -426,7 +435,7 @@ class FwdTrans8x8TestBase { const int diff = dst[j] - src[j]; #endif const uint32_t error = diff * diff; - EXPECT_GE(1u << 2 * (bit_depth_ - 8), error) + ASSERT_GE(1u << 2 * (bit_depth_ - 8), error) << "Error: 8x8 IDCT has error " << error << " at index " << j; } } @@ -456,7 +465,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < kNumCoeffs; ++j) { const int32_t diff = coeff[j] - coeff_r[j]; const uint32_t error = diff * diff; - EXPECT_GE(9u << 2 * (bit_depth_ - 8), error) + ASSERT_GE(9u << 2 * (bit_depth_ - 8), error) << "Error: 8x8 DCT has error " << error << " at index " << j; } } @@ -512,7 +521,7 @@ class FwdTrans8x8TestBase { const int diff = dst[j] - ref[j]; #endif const uint32_t error = diff * diff; - EXPECT_EQ(0u, error) + ASSERT_EQ(0u, error) << "Error: 8x8 IDCT has error " << error << " at index " << j; } } @@ -523,13 +532,16 @@ class FwdTrans8x8TestBase { vpx_bit_depth_t bit_depth_; int mask_; }; +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", on) +#endif class FwdTrans8x8DCT : public FwdTrans8x8TestBase, public 
::testing::TestWithParam<Dct8x8Param> { public: - virtual ~FwdTrans8x8DCT() {} + ~FwdTrans8x8DCT() override = default; - virtual void SetUp() { + void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); tx_type_ = GET_PARAM(2); @@ -539,13 +551,13 @@ class FwdTrans8x8DCT : public FwdTrans8x8TestBase, mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) { + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { fwd_txfm_(in, out, stride); } - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride); } @@ -566,9 +578,9 @@ TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); } class FwdTrans8x8HT : public FwdTrans8x8TestBase, public ::testing::TestWithParam<Ht8x8Param> { public: - virtual ~FwdTrans8x8HT() {} + ~FwdTrans8x8HT() override = default; - virtual void SetUp() { + void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); tx_type_ = GET_PARAM(2); @@ -578,13 +590,13 @@ class FwdTrans8x8HT : public FwdTrans8x8TestBase, mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) { + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { fwd_txfm_(in, out, stride, tx_type_); } - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride, tx_type_); } @@ -602,9 +614,9 @@ TEST_P(FwdTrans8x8HT, ExtremalCheck) { RunExtremalCheck(); } class InvTrans8x8DCT : public FwdTrans8x8TestBase, public ::testing::TestWithParam<Idct8x8Param> { public: - virtual ~InvTrans8x8DCT() {} + ~InvTrans8x8DCT() override = default; - virtual void SetUp() { + void SetUp() override { ref_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); thresh_ = GET_PARAM(2); @@ -613,13 +625,14 @@ class InvTrans8x8DCT : public FwdTrans8x8TestBase, mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride); } - void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/, int /*stride*/) {} + void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/, + int /*stride*/) override {} IdctFunc ref_txfm_; IdctFunc inv_txfm_; diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc index 8a0eb71ba..7b6c29a88 100644 --- a/test/frame_size_tests.cc +++ b/test/frame_size_tests.cc @@ -65,7 +65,7 @@ class EncoderWithExpectedError : public ::libvpx_test::Encoder { ASSERT_EQ(expected_err, res) << EncoderError(); } - virtual vpx_codec_iface_t *CodecInterface() const { + vpx_codec_iface_t *CodecInterface() const override { #if CONFIG_VP9_ENCODER return &vpx_codec_vp9_cx_algo; #else @@ -79,22 +79,22 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, protected: VP9FrameSizeTestsLarge() : EncoderTest(&::libvpx_test::kVP9), expected_res_(VPX_CODEC_OK) {} - virtual ~VP9FrameSizeTestsLarge() {} + ~VP9FrameSizeTestsLarge() override = default; - virtual 
void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } - virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder *decoder) { + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError(); return !::testing::Test::HasFailure(); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index f904e814a..b22bae87c 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -130,13 +130,19 @@ std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) { class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> { public: - virtual void SetUp() { + void SetUp() override { h_func_ = GetParam().func; bwh_ = GetParam().block_size; block_size_ = bwh_ * bwh_; rnd_.Reset(ACMRandom::DeterministicSeed()); } + // The Rand() function generates values in the range [-((1 << BitDepth) - 1), + // (1 << BitDepth) - 1]. This is because the input to the Hadamard transform + // is the residual pixel, which is defined as 'source pixel - predicted + // pixel'. Source pixel and predicted pixel take values in the range + // [0, (1 << BitDepth) - 1] and thus the residual pixel ranges from + // -((1 << BitDepth) - 1) to ((1 << BitDepth) - 1). virtual int16_t Rand() = 0; void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b, @@ -170,6 +176,31 @@ class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> { EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); } + void ExtremeValuesTest() { + const int kMaxBlockSize = 32 * 32; + DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxBlockSize]); + DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]); + memset(b, 0, sizeof(b)); + + tran_low_t b_ref[kMaxBlockSize]; + memset(b_ref, 0, sizeof(b_ref)); + + for (int i = 0; i < 2; ++i) { + // Initialize a test block with input range [-255, 255]. + const int sign = (i == 0) ? 1 : -1; + for (int j = 0; j < kMaxBlockSize; ++j) + input_extreme_block[j] = sign * 255; + + ReferenceHadamard(input_extreme_block, bwh_, b_ref, bwh_); + ASM_REGISTER_STATE_CHECK(h_func_(input_extreme_block, bwh_, b)); + + // The order of the output is not important. Sort before checking.
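+      // (Each pass saturates every residual to one extreme, first +255 and
+      // then -255; implementations may emit coefficients in different orders,
+      // hence the sorted memcmp that follows.)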
+ std::sort(b, b + block_size_); + std::sort(b_ref, b_ref + block_size_); + EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); + } + } + void VaryStride() { const int kMaxBlockSize = 32 * 32; DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]); @@ -220,11 +251,18 @@ class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> { class HadamardLowbdTest : public HadamardTestBase { protected: - virtual int16_t Rand() { return rnd_.Rand9Signed(); } + // Use values between -255 (0xFF01) and 255 (0x00FF) + int16_t Rand() override { + int16_t src = rnd_.Rand8(); + int16_t pred = rnd_.Rand8(); + return src - pred; + } }; TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } +TEST_P(HadamardLowbdTest, ExtremeValuesTest) { ExtremeValuesTest(); } + TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); } TEST_P(HadamardLowbdTest, DISABLED_Speed) { @@ -296,7 +334,12 @@ INSTANTIATE_TEST_SUITE_P( #if CONFIG_VP9_HIGHBITDEPTH class HadamardHighbdTest : public HadamardTestBase { protected: - virtual int16_t Rand() { return rnd_.Rand13Signed(); } + // Use values between -4095 (0xF001) and 4095 (0x0FFF) + int16_t Rand() override { + int16_t src = rnd_.Rand12(); + int16_t pred = rnd_.Rand12(); + return src - pred; + } }; TEST_P(HadamardHighbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } @@ -324,5 +367,14 @@ INSTANTIATE_TEST_SUITE_P( 32))); #endif // HAVE_AVX2 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, HadamardHighbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_neon, 8), + HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_neon, 16), + HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_neon, + 32))); +#endif + #endif // CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/test/idct_test.cc b/test/idct_test.cc index 1b9532e1c..279e58e2a 100644 --- a/test/idct_test.cc +++ b/test/idct_test.cc @@ -27,7 +27,7 @@ using libvpx_test::Buffer; class IDCTTest : public ::testing::TestWithParam<IdctFunc> { protected: - virtual void SetUp() { + void SetUp() override { UUT = GetParam(); input = new Buffer<int16_t>(4, 4, 0); @@ -41,7 +41,7 @@ class IDCTTest : public ::testing::TestWithParam<IdctFunc> { ASSERT_TRUE(output->Init()); } - virtual void TearDown() { + void TearDown() override { delete input; delete predict; delete output; diff --git a/test/init_vpx_test.cc b/test/init_vpx_test.cc new file mode 100644 index 000000000..f66f00b5c --- /dev/null +++ b/test/init_vpx_test.cc @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "test/init_vpx_test.h" + +#include "./vpx_config.h" + +#if !CONFIG_SHARED +#include <string> +#include "third_party/googletest/src/include/gtest/gtest.h" +#if VPX_ARCH_ARM +#include "vpx_ports/arm.h" +#endif +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +#include "vpx_ports/x86.h" +#endif +extern "C" { +#if CONFIG_VP8 +extern void vp8_rtcd(); +#endif // CONFIG_VP8 +#if CONFIG_VP9 +extern void vp9_rtcd(); +#endif // CONFIG_VP9 +extern void vpx_dsp_rtcd(); +extern void vpx_scale_rtcd(); +} + +#if VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64 +static void append_negative_gtest_filter(const char *str) { + std::string filter = GTEST_FLAG_GET(filter); + // Negative patterns begin with one '-' followed by a ':' separated list. + if (filter.find('-') == std::string::npos) filter += '-'; + filter += str; + GTEST_FLAG_SET(filter, filter); +} +#endif // VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64 +#endif // !CONFIG_SHARED + +namespace libvpx_test { +void init_vpx_test() { +#if !CONFIG_SHARED +#if VPX_ARCH_AARCH64 + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON_DOTPROD)) { + append_negative_gtest_filter(":NEON_DOTPROD.*:NEON_DOTPROD/*"); + } + if (!(caps & HAS_NEON_I8MM)) { + append_negative_gtest_filter(":NEON_I8MM.*:NEON_I8MM/*"); + } + if (!(caps & HAS_SVE)) { + append_negative_gtest_filter(":SVE.*:SVE/*"); + } +#elif VPX_ARCH_ARM + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*"); +#endif // VPX_ARCH_ARM + +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 + const int simd_caps = x86_simd_caps(); + if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*"); + if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*"); + if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*"); + if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*"); + if (!(simd_caps & HAS_SSSE3)) { + append_negative_gtest_filter(":SSSE3.*:SSSE3/*"); + } + if (!(simd_caps & HAS_SSE4_1)) { + append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*"); + } + if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*"); + if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*"); + if (!(simd_caps & HAS_AVX512)) { + append_negative_gtest_filter(":AVX512.*:AVX512/*"); + } +#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 + + // Shared library builds don't support whitebox tests that exercise internal + // symbols. +#if CONFIG_VP8 + vp8_rtcd(); +#endif // CONFIG_VP8 +#if CONFIG_VP9 + vp9_rtcd(); +#endif // CONFIG_VP9 + vpx_dsp_rtcd(); + vpx_scale_rtcd(); +#endif // !CONFIG_SHARED +} +} // namespace libvpx_test diff --git a/test/init_vpx_test.h b/test/init_vpx_test.h new file mode 100644 index 000000000..5e0dbb0e7 --- /dev/null +++ b/test/init_vpx_test.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef TEST_INIT_VPX_TEST_H_ +#define TEST_INIT_VPX_TEST_H_ + +namespace libvpx_test { +void init_vpx_test(); +} + +#endif // TEST_INIT_VPX_TEST_H_ diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc index 762d585f5..c37dc0d48 100644 --- a/test/invalid_file_test.cc +++ b/test/invalid_file_test.cc @@ -40,7 +40,7 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, protected: InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(nullptr) {} - virtual ~InvalidFileTest() { + ~InvalidFileTest() override { if (res_file_ != nullptr) fclose(res_file_); } @@ -50,10 +50,9 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, << "Result file open failed. Filename: " << res_file_name_; } - virtual bool HandleDecodeResult( - const vpx_codec_err_t res_dec, - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { EXPECT_NE(res_file_, nullptr); int expected_res_dec; @@ -172,9 +171,9 @@ VP9_INSTANTIATE_TEST_SUITE(InvalidFileTest, class InvalidFileInvalidPeekTest : public InvalidFileTest { protected: InvalidFileInvalidPeekTest() : InvalidFileTest() {} - virtual void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/, - libvpx_test::CompressedVideoSource * /*video*/, - const vpx_codec_err_t /*res_peek*/) {} + void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/, + libvpx_test::CompressedVideoSource * /*video*/, + const vpx_codec_err_t /*res_peek*/) override {} }; TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { RunTest(); } diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h index a8ac4f154..3ccac62b5 100644 --- a/test/ivf_video_source.h +++ b/test/ivf_video_source.h @@ -33,19 +33,19 @@ class IVFVideoSource : public CompressedVideoSource { compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0), end_of_file_(false) {} - virtual ~IVFVideoSource() { + ~IVFVideoSource() override { delete[] compressed_frame_buf_; if (input_file_) fclose(input_file_); } - virtual void Init() { + void Init() override { // Allocate a buffer for read in the compressed video frame. compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize]; ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed"; } - virtual void Begin() { + void Begin() override { input_file_ = OpenTestDataFile(file_name_); ASSERT_NE(input_file_, nullptr) << "Input file open failed. Filename: " << file_name_; @@ -62,7 +62,7 @@ class IVFVideoSource : public CompressedVideoSource { FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } @@ -86,11 +86,11 @@ class IVFVideoSource : public CompressedVideoSource { } } - virtual const uint8_t *cxdata() const { + const uint8_t *cxdata() const override { return end_of_file_ ? nullptr : compressed_frame_buf_; } - virtual size_t frame_size() const { return frame_sz_; } - virtual unsigned int frame_number() const { return frame_; } + size_t frame_size() const override { return frame_sz_; } + unsigned int frame_number() const override { return frame_; } protected: std::string file_name_; diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc index a13dec9ce..5292bb188 100644 --- a/test/keyframe_test.cc +++ b/test/keyframe_test.cc @@ -8,12 +8,18 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ #include <climits> +#include <cstring> #include <vector> #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" +#include "./vpx_config.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" +#include "vpx/vpx_image.h" namespace { @@ -22,9 +28,9 @@ class KeyframeTest public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> { protected: KeyframeTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~KeyframeTest() {} + ~KeyframeTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); kf_count_ = 0; @@ -33,8 +39,8 @@ class KeyframeTest set_cpu_used_ = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (kf_do_force_kf_) { frame_flags_ = (video->frame() % 3) ? 0 : VPX_EFLAG_FORCE_KF; } @@ -43,7 +49,7 @@ class KeyframeTest } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) { kf_pts_list_.push_back(pkt->data.frame.pts); kf_count_++; @@ -146,4 +152,105 @@ TEST_P(KeyframeTest, TestAutoKeyframe) { } VP8_INSTANTIATE_TEST_SUITE(KeyframeTest, ALL_TEST_MODES); + +bool IsVP9(vpx_codec_iface_t *iface) { + static const char kVP9Name[] = "WebM Project VP9"; + return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) == + 0; +} + +vpx_image_t *CreateGrayImage(vpx_img_fmt_t fmt, unsigned int w, + unsigned int h) { + vpx_image_t *const image = vpx_img_alloc(nullptr, fmt, w, h, 1); + if (!image) return image; + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + return image; +} + +// Tests kf_max_dist in one-pass encoding with zero lag. +void TestKeyframeMaximumInterval(vpx_codec_iface_t *iface, + unsigned long deadline, + unsigned int kf_max_dist) { + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), + VPX_CODEC_OK); + cfg.g_w = 320; + cfg.g_h = 240; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.kf_mode = VPX_KF_AUTO; + cfg.kf_min_dist = 0; + cfg.kf_max_dist = kf_max_dist; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + const int speed = IsVP9(iface) ? 9 : -12; + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, speed), VPX_CODEC_OK); + + vpx_image_t *image = CreateGrayImage(VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frames. + const vpx_codec_cx_pkt_t *pkt; + const unsigned int num_frames = kf_max_dist == 0 ? 
4 : 3 * kf_max_dist + 1; + for (unsigned int i = 0; i < num_frames; ++i) { + ASSERT_EQ(vpx_codec_encode(&enc, image, i, 1, 0, deadline), VPX_CODEC_OK); + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + if (kf_max_dist == 0 || i % kf_max_dist == 0) { + ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY); + } else { + ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, 0u); + } + } + } + + // Flush the encoder. + bool got_data; + do { + ASSERT_EQ(vpx_codec_encode(&enc, nullptr, 0, 1, 0, deadline), VPX_CODEC_OK); + got_data = false; + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + got_data = true; + } + } while (got_data); + + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +TEST(KeyframeIntervalTest, KeyframeMaximumInterval) { + std::vector<vpx_codec_iface_t *> ifaces; +#if CONFIG_VP8_ENCODER + ifaces.push_back(vpx_codec_vp8_cx()); +#endif +#if CONFIG_VP9_ENCODER + ifaces.push_back(vpx_codec_vp9_cx()); +#endif + for (vpx_codec_iface_t *iface : ifaces) { + for (unsigned long deadline : + { VPX_DL_REALTIME, VPX_DL_GOOD_QUALITY, VPX_DL_BEST_QUALITY }) { + // Test 0 and 1 (both mean all intra), some powers of 2, some multiples + // of 10, and some prime numbers. + for (unsigned int kf_max_dist : + { 0, 1, 2, 3, 4, 7, 10, 13, 16, 20, 23, 29, 32 }) { + TestKeyframeMaximumInterval(iface, deadline, kf_max_dist); + } + } + } +} + } // namespace diff --git a/test/level_test.cc b/test/level_test.cc index 038d75f44..36cfd645c 100644 --- a/test/level_test.cc +++ b/test/level_test.cc @@ -22,9 +22,9 @@ class LevelTest : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), min_gf_internal_(24), target_level_(0), level_(0) {} - virtual ~LevelTest() {} + ~LevelTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -41,8 +41,8 @@ class LevelTest cfg_.rc_min_quantizer = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); encoder->Control(VP9E_SET_TARGET_LEVEL, target_level_); @@ -120,7 +120,7 @@ TEST_P(LevelTest, TestTargetLevel255) { TEST_P(LevelTest, TestTargetLevelApi) { ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, 1); - static const vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo; + static vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo; vpx_codec_ctx_t enc; vpx_codec_enc_cfg_t cfg; EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(codec, &cfg, 0)); diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 4cc99a6db..ce0ddeae1 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -129,15 +129,15 @@ uint8_t GetHevThresh(ACMRandom *rnd) { class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> { public: - virtual ~Loop8Test6Param() {} - virtual void SetUp() { + ~Loop8Test6Param() override = default; + void SetUp() override { loopfilter_op_ = GET_PARAM(0); ref_loopfilter_op_ = GET_PARAM(1); bit_depth_ = GET_PARAM(2); mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() 
override { libvpx_test::ClearSystemState(); } protected: int bit_depth_; @@ -151,15 +151,15 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param); (HAVE_DSPR2 || HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH) class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> { public: - virtual ~Loop8Test9Param() {} - virtual void SetUp() { + ~Loop8Test9Param() override = default; + void SetUp() override { loopfilter_op_ = GET_PARAM(0); ref_loopfilter_op_ = GET_PARAM(1); bit_depth_ = GET_PARAM(2); mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: int bit_depth_; diff --git a/test/minmax_test.cc b/test/minmax_test.cc index 12327bc18..b49570906 100644 --- a/test/minmax_test.cc +++ b/test/minmax_test.cc @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" #include "test/acm_random.h" #include "test/register_state_check.h" @@ -28,7 +29,7 @@ typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride, const uint8_t *b, class MinMaxTest : public ::testing::TestWithParam<MinMaxFunc> { public: - virtual void SetUp() { + void SetUp() override { mm_func_ = GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); } @@ -115,7 +116,115 @@ TEST_P(MinMaxTest, CompareReferenceAndVaryStride) { } } +#if CONFIG_VP9_HIGHBITDEPTH + +using HBDMinMaxTest = MinMaxTest; + +void highbd_reference_minmax(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int *min_ret, int *max_ret) { + int min = 65535; + int max = 0; + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b); + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + const int diff = abs(a_ptr[i * a_stride + j] - b_ptr[i * b_stride + j]); + if (min > diff) min = diff; + if (max < diff) max = diff; + } + } + + *min_ret = min; + *max_ret = max; +} + +TEST_P(HBDMinMaxTest, MinValue) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t)))); + for (int i = 0; i < 64; i++) { + vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64); + vpx_memset16(CONVERT_TO_SHORTPTR(b), 65535, 64); + CONVERT_TO_SHORTPTR(b)[i] = i; // Set a minimum difference of i. + + int min, max; + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + EXPECT_EQ(65535, max); + EXPECT_EQ(i, min); + } + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); +} + +TEST_P(HBDMinMaxTest, MaxValue) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t)))); + for (int i = 0; i < 64; i++) { + vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64); + vpx_memset16(CONVERT_TO_SHORTPTR(b), 0, 64); + CONVERT_TO_SHORTPTR(b)[i] = i; // Set a maximum difference of i.
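+ // All other positions are equal, so the expected maximum is i and the + // expected minimum is 0.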
+ + int min, max; + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + EXPECT_EQ(i, max); + EXPECT_EQ(0, min); + } + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); +} + +TEST_P(HBDMinMaxTest, CompareReference) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t)))); + for (int j = 0; j < 64; j++) { + CONVERT_TO_SHORTPTR(a)[j] = rnd_.Rand16(); + CONVERT_TO_SHORTPTR(b)[j] = rnd_.Rand16(); + } + + int min_ref, max_ref, min, max; + highbd_reference_minmax(a, 8, b, 8, &min_ref, &max_ref); + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); + EXPECT_EQ(max_ref, max); + EXPECT_EQ(min_ref, min); +} + +TEST_P(HBDMinMaxTest, CompareReferenceAndVaryStride) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc((8 * 64) * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc((8 * 64) * sizeof(uint16_t)))); + for (int i = 0; i < 8 * 64; i++) { + CONVERT_TO_SHORTPTR(a)[i] = rnd_.Rand16(); + CONVERT_TO_SHORTPTR(b)[i] = rnd_.Rand16(); + } + for (int a_stride = 8; a_stride <= 64; a_stride += 8) { + for (int b_stride = 8; b_stride <= 64; b_stride += 8) { + int min_ref, max_ref, min, max; + highbd_reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref); + ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max)); + EXPECT_EQ(max_ref, max) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + EXPECT_EQ(min_ref, min) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + } + } + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); +} +#endif + INSTANTIATE_TEST_SUITE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c)); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(C, HBDMinMaxTest, + ::testing::Values(&vpx_highbd_minmax_8x8_c)); +#endif #if HAVE_SSE2 INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest, @@ -125,6 +234,10 @@ INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest, #if HAVE_NEON INSTANTIATE_TEST_SUITE_P(NEON, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_neon)); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(NEON, HBDMinMaxTest, + ::testing::Values(&vpx_highbd_minmax_8x8_neon)); +#endif #endif #if HAVE_MSA diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 7eb888a58..01e63eb69 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -59,8 +59,8 @@ const int kCountTestBlock = 1000; class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { public: - virtual ~PartialIDctTest() {} - virtual void SetUp() { + ~PartialIDctTest() override = default; + void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); fwd_txfm_ = GET_PARAM(0); full_inv_txfm_ = GET_PARAM(1); @@ -76,7 +76,7 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { case TX_8X8: size_ = 8; break; case TX_16X16: size_ = 16; break; case TX_32X32: size_ = 32; break; - default: FAIL() << "Wrong Size!"; break; + default: FAIL() << "Wrong Size!"; } // Randomize stride_ to a value less than or equal to 1024 @@ -100,7 +100,7 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { vpx_memalign(16, pixel_size_ * output_block_size_)); } - virtual void TearDown() { + void TearDown() override { vpx_free(input_block_); input_block_ 
= nullptr; vpx_free(output_block_); diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index 27d5ffa90..d2db8a7c7 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -51,10 +51,10 @@ class VpxPostProcDownAndAcrossMbRowTest public: VpxPostProcDownAndAcrossMbRowTest() : mb_post_proc_down_and_across_(GetParam()) {} - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - virtual void Run(); + void Run() override; const VpxPostProcDownAndAcrossMbRowFunc mb_post_proc_down_and_across_; // Size of the underlying data block that will be filtered. @@ -227,10 +227,10 @@ class VpxMbPostProcAcrossIpTest VpxMbPostProcAcrossIpTest() : rows_(16), cols_(16), mb_post_proc_across_ip_(GetParam()), src_(Buffer<uint8_t>(rows_, cols_, 8, 8, 17, 8)) {} - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - virtual void Run(); + void Run() override; void SetCols(unsigned char *s, int rows, int cols, int src_width) { for (int r = 0; r < rows; r++) { @@ -356,10 +356,10 @@ class VpxMbPostProcDownTest : rows_(16), cols_(16), mb_post_proc_down_(GetParam()), src_c_(Buffer<uint8_t>(rows_, cols_, 8, 8, 8, 17)) {} - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - virtual void Run(); + void Run() override; void SetRows(unsigned char *src_c, int rows, int cols, int src_width) { for (int r = 0; r < rows; r++) { diff --git a/test/predict_test.cc b/test/predict_test.cc index 747297057..474eab2cb 100644 --- a/test/predict_test.cc +++ b/test/predict_test.cc @@ -43,7 +43,7 @@ class PredictTestBase : public AbstractBench, : width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)), src_(nullptr), padded_dst_(nullptr), dst_(nullptr), dst_c_(nullptr) {} - virtual void SetUp() { + void SetUp() override { src_ = new uint8_t[kSrcSize]; ASSERT_NE(src_, nullptr); @@ -64,7 +64,7 @@ class PredictTestBase : public AbstractBench, memset(dst_c_, 0, 16 * 16); } - virtual void TearDown() { + void TearDown() override { delete[] src_; src_ = nullptr; vpx_free(padded_dst_); @@ -209,7 +209,7 @@ class PredictTestBase : public AbstractBench, } } - void Run() { + void Run() override { for (int xoffset = 0; xoffset < 8; ++xoffset) { for (int yoffset = 0; yoffset < 8; ++yoffset) { if (xoffset == 0 && yoffset == 0) { @@ -350,6 +350,14 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi))); #endif +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_lsx), + make_tuple(8, 8, &vp8_sixtap_predict8x8_lsx), + make_tuple(4, 4, &vp8_sixtap_predict4x4_lsx))); +#endif + class BilinearPredictTest : public PredictTestBase {}; TEST_P(BilinearPredictTest, TestWithRandomData) { diff --git a/test/quantize_test.cc b/test/quantize_test.cc index 57309e810..ab38f5c1b 100644 --- a/test/quantize_test.cc +++ b/test/quantize_test.cc @@ -121,13 +121,13 @@ class QuantizeTest : public QuantizeTestBase, public ::testing::TestWithParam<VP8QuantizeParam>, public AbstractBench { protected: - virtual void SetUp() { + void SetUp() override { SetupCompressor(); asm_quant_ = GET_PARAM(0); c_quant_ = GET_PARAM(1); } - virtual void Run() { + void Run() override { asm_quant_(&vp8_comp_->mb.block[0], ¯oblockd_dst_->block[0]); } diff --git a/test/realtime_test.cc b/test/realtime_test.cc index 
c5de2dcb3..a9870b3cb 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -26,7 +26,7 @@ class RealtimeTest public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> { protected: RealtimeTest() : EncoderTest(GET_PARAM(0)), frame_packets_(0) {} - ~RealtimeTest() override {} + ~RealtimeTest() override = default; void SetUp() override { InitializeConfig(); @@ -95,7 +95,7 @@ TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); } TEST_P(RealtimeTest, IntegerOverflowLarge) { if (IsVP9()) { -#if VPX_ARCH_X86_64 +#if VPX_ARCH_AARCH64 || VPX_ARCH_X86_64 TestIntegerOverflow(16384, 16384); #else TestIntegerOverflow(4096, 4096); diff --git a/test/register_state_check.h b/test/register_state_check.h index 0b837dd04..ede86ef52 100644 --- a/test/register_state_check.h +++ b/test/register_state_check.h @@ -184,13 +184,13 @@ class RegisterStateCheckMMX { uint16_t pre_fpu_env_[14]; }; -#define API_REGISTER_STATE_CHECK(statement) \ - do { \ - { \ - libvpx_test::RegisterStateCheckMMX reg_check; \ - ASM_REGISTER_STATE_CHECK(statement); \ - } \ - __asm__ volatile("" ::: "memory"); \ +#define API_REGISTER_STATE_CHECK(statement) \ + do { \ + { \ + libvpx_test::RegisterStateCheckMMX reg_check_mmx; \ + ASM_REGISTER_STATE_CHECK(statement); \ + } \ + __asm__ volatile("" ::: "memory"); \ } while (false) } // namespace libvpx_test diff --git a/test/resize_test.cc b/test/resize_test.cc index d9420a454..20ad2229b 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -244,10 +244,10 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource { } bool flag_codec_; bool smaller_width_larger_size_; - virtual ~ResizingVideoSource() {} + ~ResizingVideoSource() override = default; protected: - virtual void Next() { + void Next() override { ++frame_; unsigned int width = 0; unsigned int height = 0; @@ -264,14 +264,14 @@ class ResizeTest protected: ResizeTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~ResizeTest() {} + ~ResizeTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0); ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0); encode_frame_width_.push_back(pkt->data.frame.width[0]); @@ -286,8 +286,8 @@ class ResizeTest return encode_frame_height_[idx]; } - virtual void DecompressedFrameHook(const vpx_image_t &img, - vpx_codec_pts_t pts) { + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t pts) override { frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); } @@ -333,15 +333,15 @@ class ResizeInternalTest : public ResizeTest { ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {} #endif - virtual ~ResizeInternalTest() {} + ~ResizeInternalTest() override = default; - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { #if WRITE_COMPRESSED_STREAM outfile_ = fopen("vp90-2-05-resize.ivf", "wb"); #endif } - virtual void EndPassHook() { + void EndPassHook() override { #if WRITE_COMPRESSED_STREAM if (outfile_) { if (!fseek(outfile_, 0, SEEK_SET)) @@ -352,8 +352,8 @@ class ResizeInternalTest : public ResizeTest { #endif } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if 
(change_config_) { int new_q = 60; if (video->frame() == 0) { @@ -378,13 +378,13 @@ class ResizeInternalTest : public ResizeTest { } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0]; EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); } #if WRITE_COMPRESSED_STREAM - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ++out_frames_; // Write initial file header if first frame. @@ -447,10 +447,10 @@ class ResizeRealtimeTest public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> { protected: ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~ResizeRealtimeTest() {} + ~ResizeRealtimeTest() override = default; - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_AQ_MODE, 3); encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); @@ -463,24 +463,24 @@ class ResizeRealtimeTest } } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); set_cpu_used_ = GET_PARAM(2); } - virtual void DecompressedFrameHook(const vpx_image_t &img, - vpx_codec_pts_t pts) { + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t pts) override { frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); } - virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { double mismatch_psnr = compute_psnr(img1, img2); mismatch_psnr_ += mismatch_psnr; ++mismatch_nframes_; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0); ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0); encode_frame_width_.push_back(pkt->data.frame.width[0]); @@ -688,15 +688,15 @@ class ResizeCspTest : public ResizeTest { ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {} #endif - virtual ~ResizeCspTest() {} + ~ResizeCspTest() override = default; - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { #if WRITE_COMPRESSED_STREAM outfile_ = fopen("vp91-2-05-cspchape.ivf", "wb"); #endif } - virtual void EndPassHook() { + void EndPassHook() override { #if WRITE_COMPRESSED_STREAM if (outfile_) { if (!fseek(outfile_, 0, SEEK_SET)) @@ -707,8 +707,8 @@ class ResizeCspTest : public ResizeTest { #endif } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (CspForFrameNumber(video->frame()) != VPX_IMG_FMT_I420 && cfg_.g_profile != 1) { cfg_.g_profile = 1; @@ -721,13 +721,13 @@ class ResizeCspTest : public ResizeTest { } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (frame0_psnr_ == 0.) 
frame0_psnr_ = pkt->data.psnr.psnr[0]; EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); } #if WRITE_COMPRESSED_STREAM - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ++out_frames_; // Write initial file header if first frame. @@ -753,10 +753,10 @@ class ResizingCspVideoSource : public ::libvpx_test::DummyVideoSource { limit_ = 30; } - virtual ~ResizingCspVideoSource() {} + ~ResizingCspVideoSource() override = default; protected: - virtual void Next() { + void Next() override { ++frame_; SetImageFormat(CspForFrameNumber(frame_)); FillFrame(); diff --git a/test/sad_test.cc b/test/sad_test.cc index 0896c77f1..3530e6605 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -42,6 +42,10 @@ typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); typedef TestParams<SadMxNFunc> SadMxNParam; +typedef unsigned int (*SadSkipMxNFunc)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride); +typedef TestParams<SadSkipMxNFunc> SadSkipMxNParam; + typedef unsigned int (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); @@ -52,6 +56,11 @@ typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride, unsigned int *sad_array); typedef TestParams<SadMxNx4Func> SadMxNx4Param; +typedef void (*SadSkipMxNx4Func)(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[], int ref_stride, + unsigned int *sad_array); +typedef TestParams<SadSkipMxNx4Func> SadSkipMxNx4Param; + typedef void (*SadMxNx8Func)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); @@ -64,7 +73,7 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> { public: explicit SADTestBase(const ParamType ¶ms) : params_(params) {} - virtual void SetUp() { + void SetUp() override { source_data8_ = reinterpret_cast<uint8_t *>( vpx_memalign(kDataAlignment, kDataBlockSize)); reference_data8_ = reinterpret_cast<uint8_t *>( @@ -99,7 +108,7 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> { rnd_.Reset(ACMRandom::DeterministicSeed()); } - virtual void TearDown() { + void TearDown() override { vpx_free(source_data8_); source_data8_ = nullptr; vpx_free(reference_data8_); @@ -170,6 +179,34 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> { return sad; } + // Sum of Absolute Differences Skip rows. Given two blocks, calculate the + // absolute difference between two pixels in the same relative location every + // other row; accumulate and double the result at the end. 
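+ // For a 16x16 block, for example, only the eight even rows are compared + // (128 differences instead of 256); doubling that half-SAD approximates + // the full SAD at roughly half the cost.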
+ uint32_t ReferenceSADSkip(int ref_offset) const { + uint32_t sad = 0; + const uint8_t *const reference8 = GetReferenceFromOffset(ref_offset); + const uint8_t *const source8 = source_data_; +#if CONFIG_VP9_HIGHBITDEPTH + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReferenceFromOffset(ref_offset)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); +#endif // CONFIG_VP9_HIGHBITDEPTH + for (int h = 0; h < params_.height; h += 2) { + for (int w = 0; w < params_.width; ++w) { + if (!use_high_bit_depth_) { + sad += abs(source8[h * source_stride_ + w] - + reference8[h * reference_stride_ + w]); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + sad += abs(source16[h * source_stride_ + w] - + reference16[h * reference_stride_ + w]); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + } + return sad * 2; + } + // Sum of Absolute Differences Average. Given two blocks, and a prediction // calculate the absolute difference between one pixel and average of the // corresponding and predicted pixels; accumulate. @@ -290,6 +327,32 @@ class SADx4Test : public SADTestBase<SadMxNx4Param> { } }; +class SADSkipx4Test : public SADTestBase<SadMxNx4Param> { + public: + SADSkipx4Test() : SADTestBase(GetParam()) {} + + protected: + void SADs(unsigned int *results) const { + const uint8_t *references[] = { GetReference(0), GetReference(1), + GetReference(2), GetReference(3) }; + + ASM_REGISTER_STATE_CHECK(params_.func( + source_data_, source_stride_, references, reference_stride_, results)); + } + + void CheckSADs() const { + uint32_t reference_sad; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + + SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad = ReferenceSADSkip(GetBlockRefOffset(block)); + + EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; + } + } +}; + class SADTest : public AbstractBench, public SADTestBase<SadMxNParam> { public: SADTest() : SADTestBase(GetParam()) {} @@ -317,6 +380,33 @@ class SADTest : public AbstractBench, public SADTestBase<SadMxNParam> { } }; +class SADSkipTest : public AbstractBench, public SADTestBase<SadMxNParam> { + public: + SADSkipTest() : SADTestBase(GetParam()) {} + + protected: + unsigned int SAD(int block_idx) const { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = params_.func(source_data_, source_stride_, + reference, reference_stride_)); + return ret; + } + + void CheckSAD() const { + const unsigned int reference_sad = ReferenceSADSkip(GetBlockRefOffset(0)); + const unsigned int exp_sad = SAD(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + + void Run() override { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_); + } +}; + class SADavgTest : public AbstractBench, public SADTestBase<SadMxNAvgParam> { public: SADavgTest() : SADTestBase(GetParam()) {} @@ -397,6 +487,58 @@ TEST_P(SADTest, DISABLED_Speed) { PrintMedian(title); } +TEST_P(SADSkipTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + CheckSAD(); +} + +TEST_P(SADSkipTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + CheckSAD(); +} + +TEST_P(SADSkipTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, 
UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + TEST_P(SADavgTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); FillConstant(reference_data_, reference_stride_, mask_); @@ -554,6 +696,105 @@ TEST_P(SADx4Test, DISABLED_Speed) { reference_stride_ = tmp_stride; } +TEST_P(SADSkipx4Test, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(GetReference(0), reference_stride_, mask_); + FillConstant(GetReference(1), reference_stride_, mask_); + FillConstant(GetReference(2), reference_stride_, mask_); + FillConstant(GetReference(3), reference_stride_, mask_); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(GetReference(0), reference_stride_, 0); + FillConstant(GetReference(1), reference_stride_, 0); + FillConstant(GetReference(2), reference_stride_, 0); + FillConstant(GetReference(3), reference_stride_, 0); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, ShortRef) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. 
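+ // Reducing the stride by one pixel shifts each subsequent row off the + // usual alignment, exercising any unaligned load paths.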
+ int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, ShortSrc) { + int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, SrcAlignedByWidth) { + uint8_t *tmp_source_data = source_data_; + source_data_ += params_.width; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_data_ = tmp_source_data; +} + +TEST_P(SADSkipx4Test, DISABLED_Speed) { + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height); + uint32_t reference_sad[4]; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + vpx_usec_timer timer; + for (int block = 0; block < 4; ++block) { + reference_sad[block] = ReferenceSADSkip(GetBlockRefOffset(block)); + } + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + SADs(exp_sad); + } + vpx_usec_timer_mark(&timer); + for (int block = 0; block < 4; ++block) { + EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block; + } + const int elapsed_time = + static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000); + printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height, + bit_depth_, elapsed_time); + + reference_stride_ = tmp_stride; +} + //------------------------------------------------------------------------------ // C functions const SadMxNParam c_tests[] = { @@ -614,6 +855,56 @@ const SadMxNParam c_tests[] = { }; INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests)); +const SadSkipMxNParam skip_c_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_c), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_c), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_c), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_c), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_c), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_c), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_c), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_c), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_c), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_c), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 8), + 
SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 8), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 10), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 12), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests)); + const SadMxNAvgParam avg_c_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_c), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_c), @@ -730,6 +1021,57 @@ const SadMxNx4Param x4d_c_tests[] = { }; INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); +const SadSkipMxNx4Param skip_x4d_c_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_c), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_c), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_c), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_c), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_c), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_c), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_c), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_c), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_c), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_c), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 
10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_c_tests)); + //------------------------------------------------------------------------------ // ARM functions #if HAVE_NEON @@ -787,6 +1129,95 @@ const SadMxNParam neon_tests[] = { }; INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); +#if HAVE_NEON_DOTPROD +const SadMxNParam neon_dotprod_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_neon_dotprod), + SadMxNParam(64, 32, &vpx_sad64x32_neon_dotprod), + SadMxNParam(32, 64, &vpx_sad32x64_neon_dotprod), + SadMxNParam(32, 32, &vpx_sad32x32_neon_dotprod), + SadMxNParam(32, 16, &vpx_sad32x16_neon_dotprod), + SadMxNParam(16, 32, &vpx_sad16x32_neon_dotprod), + SadMxNParam(16, 16, &vpx_sad16x16_neon_dotprod), + SadMxNParam(16, 8, &vpx_sad16x8_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADTest, + ::testing::ValuesIn(neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + +const SadSkipMxNParam skip_neon_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_neon), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_neon), + SadSkipMxNParam(8, 4, &vpx_sad_skip_8x4_neon), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_neon), + SadSkipMxNParam(4, 4, &vpx_sad_skip_4x4_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 8), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 8), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 8), + SadSkipMxNParam(32, 32, 
&vpx_highbd_sad_skip_32x32_neon, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 8), + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 10), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 10), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 10), + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 12), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 12), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 12), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest, + ::testing::ValuesIn(skip_neon_tests)); + +#if HAVE_NEON_DOTPROD +const SadSkipMxNParam skip_neon_dotprod_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon_dotprod), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon_dotprod), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon_dotprod), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon_dotprod), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon_dotprod), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon_dotprod), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon_dotprod), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipTest, + ::testing::ValuesIn(skip_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + const SadMxNAvgParam avg_neon_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon), @@ -845,6 +1276,21 @@ const SadMxNAvgParam avg_neon_tests[] = { }; INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); +#if HAVE_NEON_DOTPROD +const SadMxNAvgParam avg_neon_dotprod_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon_dotprod), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon_dotprod), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon_dotprod), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon_dotprod), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon_dotprod), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon_dotprod), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon_dotprod), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADavgTest, 
+ ::testing::ValuesIn(avg_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + const SadMxNx4Param x4d_neon_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon), SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon), @@ -899,6 +1345,92 @@ const SadMxNx4Param x4d_neon_tests[] = { #endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); + +#if HAVE_NEON_DOTPROD +const SadMxNx4Param x4d_neon_dotprod_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon_dotprod), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon_dotprod), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon_dotprod), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon_dotprod), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon_dotprod), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon_dotprod), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon_dotprod), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADx4Test, + ::testing::ValuesIn(x4d_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + +const SadSkipMxNx4Param skip_x4d_neon_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_neon), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_neon), + SadSkipMxNx4Param(8, 4, &vpx_sad_skip_8x4x4d_neon), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_neon), + SadSkipMxNx4Param(4, 4, &vpx_sad_skip_4x4x4d_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 8), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 8), + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 10), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 10), + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 12), + SadSkipMxNx4Param(4, 
8, &vpx_highbd_sad_skip_4x8x4d_neon, 12), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 12), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_neon_tests)); + +#if HAVE_NEON_DOTPROD +const SadSkipMxNx4Param skip_x4d_neon_dotprod_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon_dotprod), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon_dotprod), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon_dotprod), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon_dotprod), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon_dotprod), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon_dotprod), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon_dotprod), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD #endif // HAVE_NEON //------------------------------------------------------------------------------ @@ -956,6 +1488,54 @@ const SadMxNParam sse2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); +const SadSkipMxNParam skip_sse2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_sse2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_sse2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_sse2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_sse2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_sse2), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_sse2), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_sse2), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_sse2), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_sse2), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_sse2), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 10),
SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipTest, + ::testing::ValuesIn(skip_sse2_tests)); + const SadMxNAvgParam avg_sse2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_sse2), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_sse2), @@ -1065,6 +1645,57 @@ const SadMxNx4Param x4d_sse2_tests[] = { #endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); + +const SadSkipMxNx4Param skip_x4d_sse2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_sse2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_sse2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_sse2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_sse2), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_sse2), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_sse2), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_sse2), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_sse2), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_sse2), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_sse2), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 12), + SadSkipMxNx4Param(64, 32, 
&vpx_highbd_sad_skip_64x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_sse2_tests)); #endif // HAVE_SSE2 #if HAVE_SSE3 @@ -1113,6 +1744,44 @@ const SadMxNParam avx2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); +const SadSkipMxNParam skip_avx2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_avx2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_avx2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_avx2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_avx2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 8), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 10), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipTest, + ::testing::ValuesIn(skip_avx2_tests)); + const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_avx2), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_avx2), @@ -1180,6 +1849,42 @@ const SadMxNx4Param x4d_avx2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); +const SadSkipMxNx4Param skip_x4d_avx2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_avx2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_avx2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_avx2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_avx2), + SadSkipMxNx4Param(32, 16, 
&vpx_sad_skip_32x16x4d_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_avx2_tests)); + #endif // HAVE_AVX2 #if HAVE_AVX512 diff --git a/test/set_roi.cc b/test/set_roi.cc index 167cf908f..693410e39 100644 --- a/test/set_roi.cc +++ b/test/set_roi.cc @@ -40,7 +40,7 @@ TEST(VP8RoiMapTest, ParameterCheck) { // Initialize elements of cpi with valid defaults. 
VP8_COMP cpi; - cpi.mb.e_mbd.mb_segement_abs_delta = SEGMENT_DELTADATA; + cpi.mb.e_mbd.mb_segment_abs_delta = SEGMENT_DELTADATA; cpi.cyclic_refresh_mode_enabled = 0; cpi.mb.e_mbd.segmentation_enabled = 0; cpi.mb.e_mbd.update_mb_segmentation_map = 0; diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index df6da8403..725d5eb85 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -21,9 +21,14 @@ #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" +#include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; +using ::testing::Combine; +using ::testing::Range; +using ::testing::ValuesIn; namespace { const int kNumIterations = 10000; @@ -33,13 +38,13 @@ typedef std::tuple<SSI16Func, SSI16Func> SumSquaresParam; class SumSquaresTest : public ::testing::TestWithParam<SumSquaresParam> { public: - virtual ~SumSquaresTest() {} - virtual void SetUp() { + ~SumSquaresTest() override = default; + void SetUp() override { ref_func_ = GET_PARAM(0); tst_func_ = GET_PARAM(1); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: SSI16Func ref_func_; @@ -126,4 +131,210 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, &vpx_sum_squares_2d_i16_msa))); #endif // HAVE_MSA + +typedef int64_t (*SSEFunc)(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height); + +struct TestSSEFuncs { + TestSSEFuncs(SSEFunc ref = nullptr, SSEFunc tst = nullptr, int depth = 0) + : ref_func(ref), tst_func(tst), bit_depth(depth) {} + SSEFunc ref_func; // Pointer to reference function + SSEFunc tst_func; // Pointer to tested function + int bit_depth; +}; + +typedef std::tuple<TestSSEFuncs, int> SSETestParam; + +class SSETest : public ::testing::TestWithParam<SSETestParam> { + public: + ~SSETest() override = default; + void SetUp() override { + params_ = GET_PARAM(0); + width_ = GET_PARAM(1); + is_hbd_ = +#if CONFIG_VP9_HIGHBITDEPTH + params_.ref_func == vpx_highbd_sse_c; +#else + false; +#endif + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast<uint8_t *>(vpx_memalign(32, 256 * 256 * 2)); + ref_ = reinterpret_cast<uint8_t *>(vpx_memalign(32, 256 * 256 * 2)); + ASSERT_NE(src_, nullptr); + ASSERT_NE(ref_, nullptr); + } + + void TearDown() override { + vpx_free(src_); + vpx_free(ref_); + } + void RunTest(bool is_random, int width, int height, int run_times); + + void GenRandomData(int width, int height, int stride) { + uint16_t *src16 = reinterpret_cast<uint16_t *>(src_); + uint16_t *ref16 = reinterpret_cast<uint16_t *>(ref_); + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!is_hbd_) { + src_[ii * stride + jj] = rnd_.Rand8(); + ref_[ii * stride + jj] = rnd_.Rand8(); + } else { + src16[ii * stride + jj] = rnd_(limit); + ref16[ii * stride + jj] = rnd_(limit); + } + } + } + } + + void GenExtremeData(int width, int height, int stride, uint8_t *data, + int16_t val) { + uint16_t *data16 = reinterpret_cast<uint16_t *>(data); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!is_hbd_) { + data[ii * stride + jj] = static_cast<uint8_t>(val); + } else { + data16[ii * stride + jj] = val; + } + } + } + } + + protected: + bool is_hbd_; + int width_; + TestSSEFuncs params_; + uint8_t *src_; + 
uint8_t *ref_; + ACMRandom rnd_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSETest); + +void SSETest::RunTest(bool is_random, int width, int height, int run_times) { + int failed = 0; + vpx_usec_timer ref_timer, test_timer; + for (int k = 0; k < 3; k++) { + int stride = 4 << rnd_(7); // Up to 256 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(7); + } + if (is_random) { + GenRandomData(width, height, stride); + } else { + const int msb = is_hbd_ ? 12 : 8; // Up to 12 bit input + const int limit = (1 << msb) - 1; + if (k == 0) { + GenExtremeData(width, height, stride, src_, 0); + GenExtremeData(width, height, stride, ref_, limit); + } else { + GenExtremeData(width, height, stride, src_, limit); + GenExtremeData(width, height, stride, ref_, 0); + } + } + int64_t res_ref, res_tst; + uint8_t *src = src_; + uint8_t *ref = ref_; +#if CONFIG_VP9_HIGHBITDEPTH + if (is_hbd_) { + src = CONVERT_TO_BYTEPTR(src_); + ref = CONVERT_TO_BYTEPTR(ref_); + } +#endif + res_ref = params_.ref_func(src, stride, ref, stride, width, height); + res_tst = params_.tst_func(src, stride, ref, stride, width, height); + if (run_times > 1) { + vpx_usec_timer_start(&ref_timer); + for (int j = 0; j < run_times; j++) { + params_.ref_func(src, stride, ref, stride, width, height); + } + vpx_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast<int>(vpx_usec_timer_elapsed(&ref_timer)); + + vpx_usec_timer_start(&test_timer); + for (int j = 0; j < run_times; j++) { + params_.tst_func(src, stride, ref, stride, width, height); + } + vpx_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast<int>(vpx_usec_timer_elapsed(&test_timer)); + + printf( + "c_time=%d \t simd_time=%d \t " + "gain=%d\n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } else { + if (!failed) { + failed = res_ref != res_tst; + EXPECT_EQ(res_ref, res_tst) + << "Error:" << (is_hbd_ ? 
"hbd " : " ") << k << " SSE Test [" + << width << "x" << height + << "] C output does not match optimized output."; + } + } + } +} + +TEST_P(SSETest, OperationCheck) { + for (int height = 4; height <= 128; height += 4) { + RunTest(true, width_, height, 1); // GenRandomData + } +} + +TEST_P(SSETest, ExtremeValues) { + for (int height = 4; height <= 128; height += 4) { + RunTest(false, width_, height, 1); + } +} + +TEST_P(SSETest, DISABLED_Speed) { + for (int height = 4; height <= 128; height += 4) { + RunTest(true, width_, height, 100); + } +} + +#if HAVE_NEON +TestSSEFuncs sse_neon[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_neon) +#endif +}; +INSTANTIATE_TEST_SUITE_P(NEON, SSETest, + Combine(ValuesIn(sse_neon), Range(4, 129, 4))); +#endif // HAVE_NEON + +#if HAVE_NEON_DOTPROD +TestSSEFuncs sse_neon_dotprod[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SSETest, + Combine(ValuesIn(sse_neon_dotprod), Range(4, 129, 4))); +#endif // HAVE_NEON_DOTPROD + +#if HAVE_SSE4_1 +TestSSEFuncs sse_sse4[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_sse4_1), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_sse4_1) +#endif +}; +INSTANTIATE_TEST_SUITE_P(SSE4_1, SSETest, + Combine(ValuesIn(sse_sse4), Range(4, 129, 4))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 + +TestSSEFuncs sse_avx2[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_avx2) +#endif +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SSETest, + Combine(ValuesIn(sse_avx2), Range(4, 129, 4))); +#endif // HAVE_AVX2 } // namespace diff --git a/test/superframe_test.cc b/test/superframe_test.cc index a5c92e914..4c3aa1625 100644 --- a/test/superframe_test.cc +++ b/test/superframe_test.cc @@ -28,9 +28,9 @@ class SuperframeTest protected: SuperframeTest() : EncoderTest(GET_PARAM(0)), modified_buf_(nullptr), last_sf_pts_(0) {} - virtual ~SuperframeTest() {} + ~SuperframeTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); const SuperframeTestParam input = GET_PARAM(1); const libvpx_test::TestMode mode = std::get<kTestMode>(input); @@ -39,17 +39,17 @@ class SuperframeTest sf_count_max_ = INT_MAX; } - virtual void TearDown() { delete[] modified_buf_; } + void TearDown() override { delete[] modified_buf_; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); } } - virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( - const vpx_codec_cx_pkt_t *pkt) { + const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( + const vpx_codec_cx_pkt_t *pkt) override { if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt; const uint8_t *buffer = reinterpret_cast<uint8_t *>(pkt->data.frame.buf); diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 484252ca4..aff4ace84 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -43,7 +43,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } protected: - virtual ~DatarateOnePassCbrSvc() {} + ~DatarateOnePassCbrSvc() override = default; virtual void ResetModel() { last_pts_ = 0; @@ -86,7 +86,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } ksvc_flex_noupd_tlenh_ = false; } - virtual void 
BeginPassHook(unsigned int /*pass*/) {} + void BeginPassHook(unsigned int /*pass*/) override {} // Example pattern for spatial layers and 2 temporal layers used in the // bypass/flexible mode. The pattern corresponds to the pattern @@ -179,8 +179,8 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { PreEncodeFrameHookSetup(video, encoder); if (video->frame() == 0) { @@ -256,13 +256,13 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { temporal_layer_id_ = layer_id.temporal_layer_id; for (int i = 0; i < number_spatial_layers_; i++) { layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; - ref_frame_config.duration[i] = 1; + ref_frame_config_.duration[i] = 1; } encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); set_frame_flags_bypass_mode(layer_id.temporal_layer_id, - number_spatial_layers_, 0, &ref_frame_config, + number_spatial_layers_, 0, &ref_frame_config_, 1); - encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); } if (update_pattern_ && video->frame() >= 100) { @@ -277,13 +277,13 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { temporal_layer_id_ = layer_id.temporal_layer_id; for (int i = 0; i < number_spatial_layers_; i++) { layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; - ref_frame_config.duration[i] = 1; + ref_frame_config_.duration[i] = 1; } encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); set_frame_flags_bypass_mode(layer_id.temporal_layer_id, - number_spatial_layers_, 0, &ref_frame_config, + number_spatial_layers_, 0, &ref_frame_config_, 0); - encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); } if (change_bitrate_ && video->frame() == 200) { @@ -468,7 +468,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { return VPX_CODEC_OK; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { uint32_t sizes[8] = { 0 }; uint32_t sizes_parsed[8] = { 0 }; int count = 0; @@ -571,7 +571,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } - virtual void EndPassHook() { + void EndPassHook() override { if (change_bitrate_) last_pts_ = last_pts_ - last_pts_ref_; duration_ = (last_pts_ + 1) * timebase_; for (int sl = 0; sl < number_spatial_layers_; ++sl) { @@ -583,7 +583,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } - virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { // TODO(marpan): Look into why an assert is triggered in compute_psnr // for mismatch frames for the special test case: ksvc_flex_noupd_tlenh. // Has to do with dropped frames in bypass/flexible svc mode. 
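The ref_frame_config to ref_frame_config_ renames above bring the member in line with the trailing-underscore convention used by the other test fields, which also makes it harder to shadow accidentally with a local. For context, this is the bypass-mode sequence those hunks drive, condensed into one hedged sketch that uses the raw vpx_codec_control() API rather than the harness's Encoder::Control() wrapper:

#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

// One frame of flexible (bypass) SVC control, mirroring the hunks above.
static void ConfigureBypassFrame(vpx_codec_ctx_t *codec, int temporal_layer_id,
                                 int num_spatial_layers) {
  vpx_svc_layer_id_t layer_id = {};
  vpx_svc_ref_frame_config_t ref_frame_config = {};
  layer_id.temporal_layer_id = temporal_layer_id;
  for (int i = 0; i < num_spatial_layers; ++i) {
    layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id;
    ref_frame_config.duration[i] = 1;  // per-spatial-layer frame duration
  }
  vpx_codec_control(codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
  // The test fills the reference/update flags with its
  // set_frame_flags_bypass_mode() helper before this final call.
  vpx_codec_control(codec, VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config);
}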
@@ -611,7 +611,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { bool single_layer_resize_; unsigned int top_sl_width_; unsigned int top_sl_height_; - vpx_svc_ref_frame_config_t ref_frame_config; + vpx_svc_ref_frame_config_t ref_frame_config_; int update_pattern_; bool change_bitrate_; vpx_codec_pts_t last_pts_ref_; @@ -639,7 +639,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { bool ksvc_flex_noupd_tlenh_; private: - virtual void SetConfig(const int num_temporal_layer) { + void SetConfig(const int num_temporal_layer) override { cfg_.rc_end_usage = VPX_CBR; cfg_.g_lag_in_frames = 0; cfg_.g_error_resilient = 1; @@ -670,10 +670,10 @@ class DatarateOnePassCbrSvcSingleBR DatarateOnePassCbrSvcSingleBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcSingleBR() {} + ~DatarateOnePassCbrSvcSingleBR() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1160,10 +1160,10 @@ class DatarateOnePassCbrSvcMultiBR DatarateOnePassCbrSvcMultiBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcMultiBR() {} + ~DatarateOnePassCbrSvcMultiBR() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1243,10 +1243,10 @@ class DatarateOnePassCbrSvcFrameDropMultiBR : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcFrameDropMultiBR() {} + ~DatarateOnePassCbrSvcFrameDropMultiBR() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1355,10 +1355,10 @@ class DatarateOnePassCbrSvcInterLayerPredSingleBR : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcInterLayerPredSingleBR() {} + ~DatarateOnePassCbrSvcInterLayerPredSingleBR() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1441,10 +1441,10 @@ class DatarateOnePassCbrSvcDenoiser DatarateOnePassCbrSvcDenoiser() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcDenoiser() {} + ~DatarateOnePassCbrSvcDenoiser() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1499,10 +1499,10 @@ class DatarateOnePassCbrSvcSmallKF DatarateOnePassCbrSvcSmallKF() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcSmallKF() {} + ~DatarateOnePassCbrSvcSmallKF() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1702,10 +1702,10 @@ class DatarateOnePassCbrSvcPostencodeDrop DatarateOnePassCbrSvcPostencodeDrop() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcPostencodeDrop() {} + ~DatarateOnePassCbrSvcPostencodeDrop() override = default; protected: - virtual void SetUp() { + void 
SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc index 7300ce667..b4337ae75 100644 --- a/test/svc_end_to_end_test.cc +++ b/test/svc_end_to_end_test.cc @@ -45,19 +45,19 @@ class ScalePartitionOnePassCbrSvc } protected: - virtual ~ScalePartitionOnePassCbrSvc() {} + ~ScalePartitionOnePassCbrSvc() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); speed_setting_ = 7; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { PreEncodeFrameHookSetup(video, encoder); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Keep track of number of non-reference frames, needed for mismatch check. // Non-reference frames are top spatial and temporal layer frames, // for TL > 0. @@ -67,12 +67,12 @@ class ScalePartitionOnePassCbrSvc num_nonref_frames_++; } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { ++mismatch_nframes_; } - virtual void SetConfig(const int /*num_temporal_layer*/) {} + void SetConfig(const int /*num_temporal_layer*/) override {} unsigned int GetMismatchFrames() const { return mismatch_nframes_; } unsigned int GetNonRefFrames() const { return num_nonref_frames_; } @@ -129,14 +129,14 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } protected: - virtual ~SyncFrameOnePassCbrSvc() {} + ~SyncFrameOnePassCbrSvc() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); speed_setting_ = 7; } - virtual bool DoDecode() const { + bool DoDecode() const override { return current_video_frame_ >= frame_to_start_decode_; } @@ -225,8 +225,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { current_video_frame_ = video->frame(); PreEncodeFrameHookSetup(video, encoder); if (video->frame() == 0) { @@ -265,8 +265,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } #if CONFIG_VP9_DECODER - virtual void PreDecodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Decoder *decoder) override { if (video->frame() < frame_to_sync_) { if (decode_to_layer_before_sync_ >= 0) decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, @@ -284,7 +284,7 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } #endif - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Keep track of number of non-reference frames, needed for mismatch check. // Non-reference frames are top spatial and temporal layer frames, // for TL > 0. 
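The SyncFrameOnePassCbrSvc overrides above gate decoding in two steps: DoDecode() skips everything before frame_to_start_decode_, and PreDecodeFrameHook() caps the decoded spatial layer until the sync frame is reached. A schematic sketch of both, assuming negative layer ids mean "decode all layers" (the post-sync branch is inferred, since the hunk is truncated before it):

#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

// DoDecode() analogue: nothing is handed to the decoder before the chosen
// start frame.
static bool ShouldDecode(unsigned int frame, unsigned int frame_to_start) {
  return frame >= frame_to_start;
}

// PreDecodeFrameHook() analogue: limit the decoded spatial layer, switching
// targets at the sync frame.
static void SetDecodeLayer(vpx_codec_ctx_t *decoder, unsigned int frame,
                           unsigned int frame_to_sync, int layer_before_sync,
                           int layer_after_sync) {
  const int layer =
      frame < frame_to_sync ? layer_before_sync : layer_after_sync;
  if (layer >= 0)
    vpx_codec_control(decoder, VP9_DECODE_SVC_SPATIAL_LAYER, layer);
}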
@@ -307,8 +307,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { if (current_video_frame_ >= frame_to_sync_) ++mismatch_nframes_; } @@ -331,7 +331,7 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, vpx_svc_ref_frame_config_t ref_frame_config_; private: - virtual void SetConfig(const int num_temporal_layer) { + void SetConfig(const int num_temporal_layer) override { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -657,15 +657,15 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, } protected: - virtual ~LoopfilterOnePassCbrSvc() {} + ~LoopfilterOnePassCbrSvc() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); speed_setting_ = 7; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { PreEncodeFrameHookSetup(video, encoder); if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) { // Consider 3 cases: @@ -694,7 +694,7 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Keep track of number of non-reference frames, needed for mismatch check. // Non-reference frames are top spatial and temporal layer frames, // for TL > 0. @@ -704,12 +704,12 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, num_nonref_frames_++; } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { ++mismatch_nframes_; } - virtual void SetConfig(const int /*num_temporal_layer*/) {} + void SetConfig(const int /*num_temporal_layer*/) override {} int GetMismatchFrames() const { return mismatch_nframes_; } int GetNonRefFrames() const { return num_nonref_frames_; } diff --git a/test/svc_test.h b/test/svc_test.h index f1d727fd9..0026372de 100644 --- a/test/svc_test.h +++ b/test/svc_test.h @@ -36,7 +36,7 @@ class OnePassCbrSvc : public ::libvpx_test::EncoderTest { } protected: - virtual ~OnePassCbrSvc() {} + ~OnePassCbrSvc() override {} virtual void SetConfig(const int num_temporal_layer) = 0; @@ -46,11 +46,11 @@ class OnePassCbrSvc : public ::libvpx_test::EncoderTest { virtual void PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder); - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder); + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override; virtual void AssignLayerBitrates(); - virtual void MismatchHook(const vpx_image_t *, const vpx_image_t *) {} + void MismatchHook(const vpx_image_t *, const vpx_image_t *) override {} vpx_svc_extra_cfg_t svc_params_; int64_t bits_in_buffer_model_[VPX_MAX_LAYERS]; diff --git a/test/test-data.mk b/test/test-data.mk index 62a9d6ef1..9eabffae3 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -29,6 +29,7 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += bus_352x288_420_f20_b8.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += 
crowd_run_360p_10_150f.y4m # Test vectors LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 55f92a25d..a9decc6b6 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -870,3 +870,4 @@ bac455906360b45338a16dd626ac5f19bc36a307 *desktop_office1.1280_720-020.yuv d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv 8a0b2c350539859463d3546a67876c83ff6ff0ac *desktopqvga.320_240.yuv +ad9942a073e245585c93f764ea299382a65939a7 *crowd_run_360p_10_150f.y4m diff --git a/test/test.mk b/test/test.mk index f60d8f823..d4521f08b 100644 --- a/test/test.mk +++ b/test/test.mk @@ -7,6 +7,8 @@ LIBVPX_TEST_SRCS-yes += codec_factory.h LIBVPX_TEST_SRCS-yes += md5_helper.h LIBVPX_TEST_SRCS-yes += register_state_check.h LIBVPX_TEST_SRCS-yes += test.mk +LIBVPX_TEST_SRCS-yes += init_vpx_test.cc +LIBVPX_TEST_SRCS-yes += init_vpx_test.h LIBVPX_TEST_SRCS-yes += test_libvpx.cc LIBVPX_TEST_SRCS-yes += test_vectors.cc LIBVPX_TEST_SRCS-yes += test_vectors.h @@ -22,10 +24,6 @@ LIBVPX_TEST_SRCS-yes += ../md5_utils.h ../md5_utils.c LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ivf_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += ../y4minput.h ../y4minput.c LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += altref_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += aq_segment_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += alt_ref_aq_segment_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += vp8_datarate_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += vp9_datarate_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += encode_api_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h @@ -37,6 +35,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += yuv_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_datarate_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc @@ -44,6 +43,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += alt_ref_aq_segment_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += aq_segment_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc @@ -58,6 +59,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_datarate_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ext_ratectrl_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += ../vp9/simple_encode.h @@ -85,6 +87,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.cc LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.h LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += webm_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_skip_loopfilter_test.cc 
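test.mk now builds init_vpx_test.cc/init_vpx_test.h into every test binary. The hunks in this change only show the files being added to the build and called from main(), so the following header shape is an assumption based on those ::libvpx_test::init_vpx_test() call sites:

#ifndef TEST_INIT_VPX_TEST_H_
#define TEST_INIT_VPX_TEST_H_

namespace libvpx_test {
// One-time process setup shared by the test binaries: append negative gtest
// filters for SIMD suites the host cannot run, and initialize the RTCD
// function tables needed by whitebox tests in static builds.
void init_vpx_test();
}  // namespace libvpx_test

#endif  // TEST_INIT_VPX_TEST_H_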
+$(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS) endif LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += decode_api_test.cc @@ -179,7 +182,7 @@ ifneq ($(CONFIG_REALTIME_ONLY),yes) LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc -ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2))) +ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2) $(HAVE_NEON))) LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc @@ -214,6 +217,8 @@ endif TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c +TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.cc +TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.h RC_INTERFACE_TEST_SRCS-yes := test_rc_interface.cc RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ratectrl_rtc_test.cc diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 28b3484a0..4c464a262 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -14,9 +14,11 @@ #include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" #include "test/clear_system_state.h" +#include "test/init_vpx_test.h" #include "test/md5_helper.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" @@ -269,28 +271,32 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, - vpx_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, nullptr, - vpx_tm_predictor_4x4_neon) + vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon, + vpx_d153_predictor_4x4_neon, vpx_d207_predictor_4x4_neon, + vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon) INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, - vpx_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, nullptr, - vpx_tm_predictor_8x8_neon) + vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon, + vpx_d153_predictor_8x8_neon, vpx_d207_predictor_8x8_neon, + vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon) INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, vpx_dc_left_predictor_16x16_neon, vpx_dc_top_predictor_16x16_neon, vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, - vpx_d135_predictor_16x16_neon, nullptr, nullptr, nullptr, - nullptr, vpx_tm_predictor_16x16_neon) + vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon, + vpx_d153_predictor_16x16_neon, vpx_d207_predictor_16x16_neon, + vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon) INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_left_predictor_32x32_neon, vpx_dc_top_predictor_32x32_neon, vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, - vpx_d135_predictor_32x32_neon, nullptr, nullptr, nullptr, - nullptr, vpx_tm_predictor_32x32_neon) + vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon, + vpx_d153_predictor_32x32_neon, vpx_d207_predictor_32x32_neon, + 
vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon) #endif // HAVE_NEON #if HAVE_MSA @@ -344,6 +350,15 @@ INTRA_PRED_TEST(VSX, TestIntraPred32, vpx_dc_predictor_32x32_vsx, vpx_tm_predictor_32x32_vsx) #endif // HAVE_VSX +#if HAVE_LSX +INTRA_PRED_TEST(LSX, TestIntraPred8, vpx_dc_predictor_8x8_lsx, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr) +INTRA_PRED_TEST(LSX, TestIntraPred16, vpx_dc_predictor_16x16_lsx, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr) +#endif // HAVE_LSX + // ----------------------------------------------------------------------------- #if CONFIG_VP9_HIGHBITDEPTH @@ -561,37 +576,41 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon, vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, - vpx_highbd_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, nullptr, - vpx_highbd_tm_predictor_4x4_neon) + vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon, + vpx_highbd_d153_predictor_4x4_neon, vpx_highbd_d207_predictor_4x4_neon, + vpx_highbd_d63_predictor_4x4_neon, vpx_highbd_tm_predictor_4x4_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, - vpx_highbd_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, nullptr, - vpx_highbd_tm_predictor_8x8_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16, - vpx_highbd_dc_predictor_16x16_neon, - vpx_highbd_dc_left_predictor_16x16_neon, - vpx_highbd_dc_top_predictor_16x16_neon, - vpx_highbd_dc_128_predictor_16x16_neon, - vpx_highbd_v_predictor_16x16_neon, - vpx_highbd_h_predictor_16x16_neon, - vpx_highbd_d45_predictor_16x16_neon, - vpx_highbd_d135_predictor_16x16_neon, nullptr, nullptr, - nullptr, nullptr, vpx_highbd_tm_predictor_16x16_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32, - vpx_highbd_dc_predictor_32x32_neon, - vpx_highbd_dc_left_predictor_32x32_neon, - vpx_highbd_dc_top_predictor_32x32_neon, - vpx_highbd_dc_128_predictor_32x32_neon, - vpx_highbd_v_predictor_32x32_neon, - vpx_highbd_h_predictor_32x32_neon, - vpx_highbd_d45_predictor_32x32_neon, - vpx_highbd_d135_predictor_32x32_neon, nullptr, nullptr, - nullptr, nullptr, vpx_highbd_tm_predictor_32x32_neon) + vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon, + vpx_highbd_d153_predictor_8x8_neon, vpx_highbd_d207_predictor_8x8_neon, + vpx_highbd_d63_predictor_8x8_neon, vpx_highbd_tm_predictor_8x8_neon) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, + vpx_highbd_dc_left_predictor_16x16_neon, + vpx_highbd_dc_top_predictor_16x16_neon, + vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon, + vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, + vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon, + vpx_highbd_d153_predictor_16x16_neon, vpx_highbd_d207_predictor_16x16_neon, + vpx_highbd_d63_predictor_16x16_neon, vpx_highbd_tm_predictor_16x16_neon) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, + vpx_highbd_dc_left_predictor_32x32_neon, + 
vpx_highbd_dc_top_predictor_32x32_neon, + vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon, + vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, + vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon, + vpx_highbd_d153_predictor_32x32_neon, vpx_highbd_d207_predictor_32x32_neon, + vpx_highbd_d63_predictor_32x32_neon, vpx_highbd_tm_predictor_32x32_neon) #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH -#include "test/test_libvpx.cc" +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + ::libvpx_test::init_vpx_test(); + return RUN_ALL_TESTS(); +} diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc index 222a83f8c..c1798b8b8 100644 --- a/test/test_libvpx.cc +++ b/test/test_libvpx.cc @@ -7,69 +7,12 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include <string> +#include "test/init_vpx_test.h" #include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" -#if VPX_ARCH_X86 || VPX_ARCH_X86_64 -#include "vpx_ports/x86.h" -#endif -extern "C" { -#if CONFIG_VP8 -extern void vp8_rtcd(); -#endif // CONFIG_VP8 -#if CONFIG_VP9 -extern void vp9_rtcd(); -#endif // CONFIG_VP9 -extern void vpx_dsp_rtcd(); -extern void vpx_scale_rtcd(); -} - -#if VPX_ARCH_X86 || VPX_ARCH_X86_64 -static void append_negative_gtest_filter(const char *str) { - std::string filter = ::testing::FLAGS_gtest_filter; - // Negative patterns begin with one '-' followed by a ':' separated list. - if (filter.find('-') == std::string::npos) filter += '-'; - filter += str; - ::testing::FLAGS_gtest_filter = filter; -} -#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 - int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); - -#if VPX_ARCH_X86 || VPX_ARCH_X86_64 - const int simd_caps = x86_simd_caps(); - if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*"); - if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*"); - if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*"); - if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*"); - if (!(simd_caps & HAS_SSSE3)) { - append_negative_gtest_filter(":SSSE3.*:SSSE3/*"); - } - if (!(simd_caps & HAS_SSE4_1)) { - append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*"); - } - if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*"); - if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*"); - if (!(simd_caps & HAS_AVX512)) { - append_negative_gtest_filter(":AVX512.*:AVX512/*"); - } -#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 - -#if !CONFIG_SHARED -// Shared library builds don't support whitebox tests -// that exercise internal symbols. 
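The SIMD-capability filtering and RTCD setup being deleted from test_libvpx.cc here presumably moved into the new test/init_vpx_test.cc largely verbatim, so that test_intra_pred_speed gets the same setup without its old trick of including test_libvpx.cc directly. A condensed reconstruction, abbreviated to a single ISA check (the exact file layout is an assumption):

#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
#if VPX_ARCH_X86 || VPX_ARCH_X86_64
#include "vpx_ports/x86.h"
#endif

extern "C" {
void vpx_dsp_rtcd();
void vpx_scale_rtcd();
}

namespace libvpx_test {
void init_vpx_test() {
#if VPX_ARCH_X86 || VPX_ARCH_X86_64
  // Same negative-filter trick as the deleted block: hide, e.g., the AVX2
  // suites on machines without AVX2 support.
  std::string filter = ::testing::FLAGS_gtest_filter;
  if (filter.find('-') == std::string::npos) filter += '-';
  if (!(x86_simd_caps() & HAS_AVX2)) filter += ":AVX2.*:AVX2/*";
  // ... MMX/SSE/SSE2/SSE3/SSSE3/SSE4_1/AVX/AVX512 handled the same way ...
  ::testing::FLAGS_gtest_filter = filter;
#endif
#if !CONFIG_SHARED
  // Whitebox tests call internal symbols directly, so prime the RTCD tables
  // (vp8_rtcd()/vp9_rtcd() are likewise guarded by CONFIG_VP8/CONFIG_VP9).
  vpx_dsp_rtcd();
  vpx_scale_rtcd();
#endif
}
}  // namespace libvpx_test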
-#if CONFIG_VP8 - vp8_rtcd(); -#endif // CONFIG_VP8 -#if CONFIG_VP9 - vp9_rtcd(); -#endif // CONFIG_VP9 - vpx_dsp_rtcd(); - vpx_scale_rtcd(); -#endif // !CONFIG_SHARED - + ::libvpx_test::init_vpx_test(); return RUN_ALL_TESTS(); } diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc index ca990f4dd..ee552113c 100644 --- a/test/test_vector_test.cc +++ b/test/test_vector_test.cc @@ -48,7 +48,7 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, #endif } - virtual ~TestVectorTest() { + ~TestVectorTest() override { if (md5_file_) fclose(md5_file_); } @@ -59,9 +59,8 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, } #if CONFIG_VP9_DECODER - virtual void PreDecodeFrameHook( - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { if (video.frame_number() == 0 && mt_mode_ >= 0) { if (mt_mode_ == 1) { decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 1); @@ -77,8 +76,8 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, } #endif - virtual void DecompressedFrameHook(const vpx_image_t &img, - const unsigned int frame_number) { + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { ASSERT_NE(md5_file_, nullptr); char expected_md5[33]; char junk[128]; diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc index d92c13f88..dab6e531b 100644 --- a/test/tile_independence_test.cc +++ b/test/tile_independence_test.cc @@ -36,18 +36,18 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, inv_dec_->Control(VP9_INVERT_TILE_DECODE_ORDER, 1); } - virtual ~TileIndependenceTest() { + ~TileIndependenceTest() override { delete fw_dec_; delete inv_dec_; } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(libvpx_test::kTwoPassGood); } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_TILE_COLUMNS, n_tiles_); } @@ -65,7 +65,7 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, md5->Add(img); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { UpdateMD5(fw_dec_, pkt, &md5_fw_order_); UpdateMD5(inv_dec_, pkt, &md5_inv_order_); } diff --git a/test/timestamp_test.cc b/test/timestamp_test.cc index 645a9f2ff..00abf8f31 100644 --- a/test/timestamp_test.cc +++ b/test/timestamp_test.cc @@ -42,16 +42,16 @@ class DummyTimebaseVideoSource : public ::libvpx_test::DummyVideoSource { (static_cast<double>(framerate_numerator_) / framerate_denominator_); } - virtual vpx_codec_pts_t pts() const { + vpx_codec_pts_t pts() const override { return static_cast<vpx_codec_pts_t>(frame_ * FrameDuration() + starting_pts_ + 0.5); } - virtual unsigned long duration() const { + unsigned long duration() const override { return static_cast<unsigned long>(FrameDuration() + 0.5); } - virtual vpx_rational_t timebase() const { return timebase_; } + vpx_rational_t timebase() const override { return timebase_; } void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; } @@ -67,9 +67,9 @@ class TimestampTest public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> { protected: TimestampTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~TimestampTest() {} + 
~TimestampTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); } diff --git a/test/tools_common.sh b/test/tools_common.sh index 0e4a0a5c0..d0dd24df3 100755 --- a/test/tools_common.sh +++ b/test/tools_common.sh @@ -280,7 +280,12 @@ run_tests() { test_end "${test}" done - local tested_config="$(test_configuration_target) @ $(current_hash)" + # C vs SIMD tests are run for x86 32-bit, 64-bit and ARM platform + if [ "${test_name}" = "vp9_c_vs_simd_encode" ]; then + local tested_config="$(current_hash)" + else + local tested_config="$(test_configuration_target) @ $(current_hash)" + fi echo "${test_name}: Done, all tests pass for ${tested_config}." } diff --git a/test/variance_test.cc b/test/variance_test.cc index a6c8ef048..b8320e9ce 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -210,7 +210,7 @@ class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> { public: SumOfSquaresTest() : func_(GetParam()) {} - virtual ~SumOfSquaresTest() { libvpx_test::ClearSystemState(); } + ~SumOfSquaresTest() override { libvpx_test::ClearSystemState(); } protected: void ConstTest(); @@ -289,7 +289,7 @@ template <typename FunctionType> class MainTestClass : public ::testing::TestWithParam<TestParams<FunctionType> > { public: - virtual void SetUp() { + void SetUp() override { params_ = this->GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); @@ -308,7 +308,7 @@ class MainTestClass #endif } - virtual void TearDown() { + void TearDown() override { #if CONFIG_VP9_HIGHBITDEPTH if (use_high_bit_depth()) { // TODO(skal): remove! @@ -568,7 +568,7 @@ template <typename FunctionType> class SubpelVarianceTest : public ::testing::TestWithParam<TestParams<FunctionType> > { public: - virtual void SetUp() { + void SetUp() override { params_ = this->GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); @@ -592,7 +592,7 @@ class SubpelVarianceTest ASSERT_NE(ref_, nullptr); } - virtual void TearDown() { + void TearDown() override { if (!use_high_bit_depth()) { vpx_free(src_); vpx_free(sec_); @@ -773,6 +773,7 @@ TEST_P(VpxSseTest, RefSse) { RefTestSse(); } TEST_P(VpxSseTest, MaxSse) { MaxTestSse(); } TEST_P(VpxMseTest, RefMse) { RefTestMse(); } TEST_P(VpxMseTest, MaxMse) { MaxTestMse(); } +TEST_P(VpxMseTest, DISABLED_Speed) { SpeedTest(); } TEST_P(VpxVarianceTest, Zero) { ZeroTest(); } TEST_P(VpxVarianceTest, Ref) { RefTest(); } TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); } @@ -1428,7 +1429,10 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(5, 4, &vpx_variance32x16_avx2), VarianceParams(4, 5, &vpx_variance16x32_avx2), VarianceParams(4, 4, &vpx_variance16x16_avx2), - VarianceParams(4, 3, &vpx_variance16x8_avx2))); + VarianceParams(4, 3, &vpx_variance16x8_avx2), + VarianceParams(3, 4, &vpx_variance8x16_avx2), + VarianceParams(3, 3, &vpx_variance8x8_avx2), + VarianceParams(3, 2, &vpx_variance8x4_avx2))); INSTANTIATE_TEST_SUITE_P( AVX2, VpxSubpelVarianceTest, @@ -1450,8 +1454,10 @@ INSTANTIATE_TEST_SUITE_P(NEON, VpxSseTest, &vpx_get4x4sse_cs_neon))); INSTANTIATE_TEST_SUITE_P(NEON, VpxMseTest, - ::testing::Values(MseParams(4, 4, - &vpx_mse16x16_neon))); + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon), + MseParams(4, 3, &vpx_mse16x8_neon), + MseParams(3, 4, &vpx_mse8x16_neon), + MseParams(3, 3, &vpx_mse8x8_neon))); INSTANTIATE_TEST_SUITE_P( NEON, VpxVarianceTest, @@ -1469,6 +1475,35 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 3, &vpx_variance4x8_neon), VarianceParams(2, 2, &vpx_variance4x4_neon))); +#if 
HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxSseTest, + ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_neon_dotprod))); + +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon_dotprod), + MseParams(4, 3, &vpx_mse16x8_neon_dotprod), + MseParams(3, 4, &vpx_mse8x16_neon_dotprod), + MseParams(3, 3, &vpx_mse8x8_neon_dotprod))); + +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_neon_dotprod), + VarianceParams(6, 5, &vpx_variance64x32_neon_dotprod), + VarianceParams(5, 6, &vpx_variance32x64_neon_dotprod), + VarianceParams(5, 5, &vpx_variance32x32_neon_dotprod), + VarianceParams(5, 4, &vpx_variance32x16_neon_dotprod), + VarianceParams(4, 5, &vpx_variance16x32_neon_dotprod), + VarianceParams(4, 4, &vpx_variance16x16_neon_dotprod), + VarianceParams(4, 3, &vpx_variance16x8_neon_dotprod), + VarianceParams(3, 4, &vpx_variance8x16_neon_dotprod), + VarianceParams(3, 3, &vpx_variance8x8_neon_dotprod), + VarianceParams(3, 2, &vpx_variance8x4_neon_dotprod), + VarianceParams(2, 3, &vpx_variance4x8_neon_dotprod), + VarianceParams(2, 2, &vpx_variance4x4_neon_dotprod))); +#endif // HAVE_NEON_DOTPROD + INSTANTIATE_TEST_SUITE_P( NEON, VpxSubpelVarianceTest, ::testing::Values( @@ -1505,6 +1540,36 @@ INSTANTIATE_TEST_SUITE_P( #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDMseTest, + ::testing::Values( + MseParams(4, 4, &vpx_highbd_12_mse16x16_neon, VPX_BITS_12), + MseParams(4, 3, &vpx_highbd_12_mse16x8_neon, VPX_BITS_12), + MseParams(3, 4, &vpx_highbd_12_mse8x16_neon, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_neon, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_neon, VPX_BITS_10), + MseParams(4, 3, &vpx_highbd_10_mse16x8_neon, VPX_BITS_10), + MseParams(3, 4, &vpx_highbd_10_mse8x16_neon, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_neon, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_neon, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_neon, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8))); + +// TODO(webm:1819): Re-enable when vpx_highbd_8_mse16x16_neon_dotprod, etc. can +// be used again. 
+#if 0 +#if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxHBDMseTest, + ::testing::Values( + MseParams(4, 4, &vpx_highbd_8_mse16x16_neon_dotprod, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_neon_dotprod, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8))); +#endif // HAVE_NEON_DOTPROD +#endif // 0 + +INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDVarianceTest, ::testing::Values( VarianceParams(6, 6, &vpx_highbd_12_variance64x64_neon, 12), @@ -1572,6 +1637,10 @@ INSTANTIATE_TEST_SUITE_P( 12), SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon, 12), + SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon, + 12), + SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon, + 12), SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon, 10), SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon, @@ -1594,6 +1663,10 @@ INSTANTIATE_TEST_SUITE_P( 10), SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon, 10), + SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon, + 10), + SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon, + 10), SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon, 8), SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon, @@ -1613,7 +1686,9 @@ INSTANTIATE_TEST_SUITE_P( SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon, 8), SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8), - SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8), + SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8), + SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon, 8))); INSTANTIATE_TEST_SUITE_P( @@ -1652,6 +1727,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_neon, 12), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_12_sub_pixel_avg_variance4x8_neon, + 12), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_12_sub_pixel_avg_variance4x4_neon, + 12), SubpelAvgVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_neon, 10), @@ -1685,6 +1766,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_neon, 10), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_10_sub_pixel_avg_variance4x8_neon, + 10), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_10_sub_pixel_avg_variance4x4_neon, + 10), SubpelAvgVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_neon, 8), @@ -1717,6 +1804,12 @@ INSTANTIATE_TEST_SUITE_P( 8), SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_neon, + 8), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_8_sub_pixel_avg_variance4x8_neon, + 8), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_8_sub_pixel_avg_variance4x4_neon, 8))); #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/video_source.h b/test/video_source.h index a10ff6fb0..2194126f1 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -64,7 +64,7 @@ inline FILE *OpenTestDataFile(const std::string &file_name) { return fopen(path_to_source.c_str(), "rb"); } -static FILE *GetTempOutFile(std::string *file_name) { +static FILE *GetTempOutFile(std::string *file_name, const char *io_mode) { file_name->clear(); #if defined(_WIN32) char fname[MAX_PATH]; @@ -73,7 +73,7 @@ static FILE 
*GetTempOutFile(std::string *file_name) { // Assume for now that the filename generated is unique per process if (GetTempFileNameA(tmppath, "lvx", 0, fname)) { file_name->assign(fname); - return fopen(fname, "wb+"); + return fopen(fname, io_mode); } } return nullptr; @@ -94,13 +94,16 @@ static FILE *GetTempOutFile(std::string *file_name) { const int fd = mkstemp(temp_file_name.get()); if (fd == -1) return nullptr; *file_name = temp_file_name.get(); - return fdopen(fd, "wb+"); + return fdopen(fd, io_mode); #endif } class TempOutFile { public: - TempOutFile() { file_ = GetTempOutFile(&file_name_); } + TempOutFile() { file_ = GetTempOutFile(&file_name_, "wb+"); } + TempOutFile(const char *io_mode) { + file_ = GetTempOutFile(&file_name_, io_mode); + } ~TempOutFile() { CloseFile(); if (!file_name_.empty()) { @@ -160,35 +163,35 @@ class DummyVideoSource : public VideoSource { ReallocImage(); } - virtual ~DummyVideoSource() { vpx_img_free(img_); } + ~DummyVideoSource() override { vpx_img_free(img_); } - virtual void Begin() { + void Begin() override { frame_ = 0; FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } - virtual vpx_image_t *img() const { + vpx_image_t *img() const override { return (frame_ < limit_) ? img_ : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. - virtual vpx_codec_pts_t pts() const { return frame_; } + vpx_codec_pts_t pts() const override { return frame_; } - virtual unsigned long duration() const { return 1; } + unsigned long duration() const override { return 1; } - virtual vpx_rational_t timebase() const { + vpx_rational_t timebase() const override { const vpx_rational_t t = { 1, 30 }; return t; } - virtual unsigned int frame() const { return frame_; } + unsigned int frame() const override { return frame_; } - virtual unsigned int limit() const { return limit_; } + unsigned int limit() const override { return limit_; } void set_limit(unsigned int limit) { limit_ = limit; } @@ -235,7 +238,7 @@ class RandomVideoSource : public DummyVideoSource { protected: // Reset the RNG to get a matching stream for the second pass - virtual void Begin() { + void Begin() override { frame_ = 0; rnd_.Reset(seed_); FillFrame(); @@ -243,7 +246,7 @@ class RandomVideoSource : public DummyVideoSource { // 15 frames of noise, followed by 15 static frames. Reset to 0 rather // than holding previous frames to encourage keyframes to be thrown. 
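The GetTempOutFile()/TempOutFile changes above let a caller pick the stdio mode instead of the hard-coded "wb+", on both the _WIN32 (GetTempFileNameA) and POSIX (mkstemp/fdopen) paths. A usage sketch, assuming TempOutFile's usual file() accessor; the "payload" write is purely illustrative:

#include <cstdio>
#include "test/video_source.h"

void TempFileModesExample() {
  libvpx_test::TempOutFile write_only("wb");  // new: caller-selected io_mode
  libvpx_test::TempOutFile read_write;        // unchanged default: "wb+"
  if (write_only.file() != nullptr) {
    fputs("payload", write_only.file());      // cleaned up by the destructor
  }
}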
- virtual void FillFrame() { + void FillFrame() override { if (img_) { if (frame_ % 30 < 15) { for (size_t i = 0; i < raw_sz_; ++i) img_->img_data[i] = rnd_.Rand8(); diff --git a/test/vp8_datarate_test.cc b/test/vp8_datarate_test.cc index 64a861d15..aee27af66 100644 --- a/test/vp8_datarate_test.cc +++ b/test/vp8_datarate_test.cc @@ -24,10 +24,10 @@ class DatarateTestLarge public: DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {} - virtual ~DatarateTestLarge() {} + ~DatarateTestLarge() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); set_cpu_used_ = GET_PARAM(2); @@ -47,8 +47,8 @@ class DatarateTestLarge use_roi_ = false; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); @@ -74,7 +74,7 @@ class DatarateTestLarge duration_ = 0; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Time since last timestamp = duration. vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; @@ -121,7 +121,7 @@ class DatarateTestLarge ++frame_number_; } - virtual void EndPassHook() { + void EndPassHook() override { if (bits_total_) { const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit @@ -301,7 +301,7 @@ TEST_P(DatarateTestLarge, DropFramesMultiThreads) { class DatarateTestRealTime : public DatarateTestLarge { public: - virtual ~DatarateTestRealTime() {} + ~DatarateTestRealTime() override = default; }; #if CONFIG_TEMPORAL_DENOISING diff --git a/test/vp8_denoiser_sse2_test.cc b/test/vp8_denoiser_sse2_test.cc index 8cb84ddd8..7fa867d8b 100644 --- a/test/vp8_denoiser_sse2_test.cc +++ b/test/vp8_denoiser_sse2_test.cc @@ -30,11 +30,11 @@ namespace { const int kNumPixels = 16 * 16; class VP8DenoiserTest : public ::testing::TestWithParam<int> { public: - virtual ~VP8DenoiserTest() {} + ~VP8DenoiserTest() override = default; - virtual void SetUp() { increase_denoising_ = GetParam(); } + void SetUp() override { increase_denoising_ = GetParam(); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: int increase_denoising_; diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc index 1b73a72a0..66d5c151c 100644 --- a/test/vp8_fdct4x4_test.cc +++ b/test/vp8_fdct4x4_test.cc @@ -74,7 +74,7 @@ using libvpx_test::ACMRandom; class FdctTest : public ::testing::TestWithParam<FdctFunc> { public: - virtual void SetUp() { + void SetUp() override { fdct_func_ = GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); } diff --git a/test/vp8_fragments_test.cc b/test/vp8_fragments_test.cc index 6e5baf229..01b4c2120 100644 --- a/test/vp8_fragments_test.cc +++ b/test/vp8_fragments_test.cc @@ -17,9 +17,9 @@ class VP8FragmentsTest : public ::libvpx_test::EncoderTest, public ::testing::Test { protected: VP8FragmentsTest() : EncoderTest(&::libvpx_test::kVP8) {} - virtual ~VP8FragmentsTest() {} + ~VP8FragmentsTest() override = default; - virtual void SetUp() { + void SetUp() override { const unsigned long init_flags = // NOLINT(runtime/int) VPX_CODEC_USE_OUTPUT_PARTITION; InitializeConfig(); diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index 
7410f3c01..9fbc1d4d9 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -25,7 +25,7 @@ namespace { struct Vp8RCTestVideo { - Vp8RCTestVideo() {} + Vp8RCTestVideo() = default; Vp8RCTestVideo(const char *name_, int width_, int height_, unsigned int frames_) : name(name_), width(width_), height(height_), frames(frames_) {} @@ -52,11 +52,12 @@ class Vp8RcInterfaceTest public ::libvpx_test::CodecTestWith2Params<int, Vp8RCTestVideo> { public: Vp8RcInterfaceTest() - : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false) {} - virtual ~Vp8RcInterfaceTest() {} + : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false), + frame_drop_thresh_(0) {} + ~Vp8RcInterfaceTest() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } @@ -111,8 +112,8 @@ class Vp8RcInterfaceTest return layer_id; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (rc_cfg_.ts_number_layers > 1) { const int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers); const int frame_flags = @@ -127,56 +128,79 @@ class Vp8RcInterfaceTest encoder->Control(VP8E_SET_CPUUSED, -6); encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); - } else if (frame_params_.frame_type == INTER_FRAME) { + } else if (frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { // Disable golden frame update. frame_flags_ |= VP8_EFLAG_NO_UPD_GF; frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; } } - frame_params_.frame_type = - video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; encoder_exit_ = video->frame() == test_video_.frames; } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { if (encoder_exit_) { return; } int qp; encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); - rc_api_->ComputeQP(frame_params_); - ASSERT_EQ(rc_api_->GetQP(), qp); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) { + ASSERT_EQ(rc_api_->GetQP(), qp); + } else { + num_drops_++; + } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { rc_api_->PostEncodeUpdate(pkt->data.frame.sz); } void RunOneLayer() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfig(); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunOneLayerDropFrames() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + frame_drop_thresh_ = 30; + num_drops_ = 0; + // Use lower target_bitrate and max_quantizer to trigger drops. 
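These rate-control hooks now consume ComputeQP()'s FrameDropDecision instead of assuming every frame gets encoded, and the new *DropFrames runs (whose bitrate-lowering setup continues just below) deliberately provoke drops to exercise that path. The per-frame contract, condensed into one hypothetical helper; the enum and method names are as they appear in the hunks, the wrapper itself is illustrative:

#include <stdint.h>
#include "vp8/vp8_ratectrl_rtc.h"

// Returns the QP to compare against VP8E_GET_LAST_QUANTIZER, or -1 when the
// rate controller decides to drop the frame.
int RcFrameExample(libvpx::VP8RateControlRTC *rc_api, unsigned int frame,
                   unsigned int key_interval, uint64_t encoded_size,
                   int *num_drops) {
  libvpx::VP8FrameParamsQpRTC params;
  params.frame_type = (frame % key_interval == 0)
                          ? libvpx::RcFrameType::kKeyFrame
                          : libvpx::RcFrameType::kInterFrame;
  params.temporal_layer_id = 0;  // single-layer case
  if (rc_api->ComputeQP(params) == libvpx::FrameDropDecision::kDrop) {
    ++*num_drops;  // the encoder is expected to drop the same frame
    return -1;
  }
  const int qp = rc_api->GetQP();
  rc_api->PostEncodeUpdate(encoded_size);  // feed back the produced bytes
  return qp;
}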
+ target_bitrate_ = target_bitrate_ >> 2; + SetConfig(); + rc_cfg_.max_quantizer = 56; + cfg_.rc_max_quantizer = 56; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, test_video_.height, 30, 1, 0, test_video_.frames); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); } void RunPeriodicKey() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; key_interval_ = 100; + frame_drop_thresh_ = 30; SetConfig(); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, test_video_.height, 30, 1, 0, @@ -188,11 +212,9 @@ class Vp8RcInterfaceTest void RunTemporalLayers2TL() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfigTemporalLayers(2); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, test_video_.height, 30, 1, 0, @@ -204,11 +226,9 @@ class Vp8RcInterfaceTest void RunTemporalLayers3TL() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfigTemporalLayers(3); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, test_video_.height, 30, 1, 0, @@ -217,6 +237,28 @@ class Vp8RcInterfaceTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunTemporalLayers3TLDropFrames() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + frame_drop_thresh_ = 30; + num_drops_ = 0; + // Use lower target_bitrate and max_quantizer to trigger drops. + target_bitrate_ = target_bitrate_ >> 2; + SetConfigTemporalLayers(3); + rc_cfg_.max_quantizer = 56; + cfg_.rc_max_quantizer = 56; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + private: void SetConfig() { rc_cfg_.width = test_video_.width; @@ -232,6 +274,7 @@ class Vp8RcInterfaceTest rc_cfg_.max_intra_bitrate_pct = 1000; rc_cfg_.framerate = 30.0; rc_cfg_.layer_target_bitrate[0] = target_bitrate_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; // Encoder settings for ground truth. 
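Note how SetConfig() mirrors each new field into both configurations, so the real encoder (whose ground-truth settings continue below) stays constrained exactly like the external rate controller: frame_drop_thresh pairs with rc_dropframe_thresh, and max_quantizer with rc_max_quantizer. A sketch of that pairing; the VP8RateControlRtcConfig type name is assumed from the test's rc_cfg_ member:

#include "vp8/vp8_ratectrl_rtc.h"
#include "vpx/vpx_encoder.h"

// Keep the two configs in lockstep so the encoder provides a valid ground
// truth for the external rate controller.
void MirrorDropSettings(libvpx::VP8RateControlRtcConfig *rc_cfg,
                        vpx_codec_enc_cfg_t *cfg, int drop_thresh, int max_q) {
  rc_cfg->frame_drop_thresh = drop_thresh;
  cfg->rc_dropframe_thresh = drop_thresh;  // encoder-side equivalent
  rc_cfg->max_quantizer = max_q;
  cfg->rc_max_quantizer = max_q;
}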
cfg_.g_w = test_video_.width; @@ -250,6 +293,7 @@ class Vp8RcInterfaceTest cfg_.rc_target_bitrate = target_bitrate_; cfg_.kf_min_dist = key_interval_; cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; } void SetConfigTemporalLayers(int temporal_layers) { @@ -265,6 +309,7 @@ class Vp8RcInterfaceTest rc_cfg_.overshoot_pct = 50; rc_cfg_.max_intra_bitrate_pct = 1000; rc_cfg_.framerate = 30.0; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; if (temporal_layers == 2) { rc_cfg_.layer_target_bitrate[0] = 60 * target_bitrate_ / 100; rc_cfg_.layer_target_bitrate[1] = target_bitrate_; @@ -298,6 +343,7 @@ class Vp8RcInterfaceTest cfg_.rc_target_bitrate = target_bitrate_; cfg_.kf_min_dist = key_interval_; cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; // 2 Temporal layers, no spatial layers, CBR mode. cfg_.ss_number_layers = 1; cfg_.ts_number_layers = temporal_layers; @@ -325,16 +371,24 @@ class Vp8RcInterfaceTest Vp8RCTestVideo test_video_; libvpx::VP8FrameParamsQpRTC frame_params_; bool encoder_exit_; + int frame_drop_thresh_; + int num_drops_; }; TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); } +TEST_P(Vp8RcInterfaceTest, OneLayerDropFrames) { RunOneLayerDropFrames(); } + TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); } TEST_P(Vp8RcInterfaceTest, TemporalLayers2TL) { RunTemporalLayers2TL(); } TEST_P(Vp8RcInterfaceTest, TemporalLayers3TL) { RunTemporalLayers3TL(); } +TEST_P(Vp8RcInterfaceTest, TemporalLayers3TLDropFrames) { + RunTemporalLayers3TLDropFrames(); +} + VP8_INSTANTIATE_TEST_SUITE(Vp8RcInterfaceTest, ::testing::Values(200, 400, 1000), ::testing::ValuesIn(kVp8RCTestVectors)); diff --git a/test/vp9_arf_freq_test.cc b/test/vp9_arf_freq_test.cc index c7e6f1af0..3882326d2 100644 --- a/test/vp9_arf_freq_test.cc +++ b/test/vp9_arf_freq_test.cc @@ -86,9 +86,9 @@ class ArfFreqTest : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {} - virtual ~ArfFreqTest() {} + ~ArfFreqTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(test_encode_param_.mode); if (test_encode_param_.mode != ::libvpx_test::kRealTime) { @@ -104,7 +104,7 @@ class ArfFreqTest dec_cfg_.threads = 4; } - virtual void BeginPassHook(unsigned int) { + void BeginPassHook(unsigned int) override { min_run_ = ARF_NOT_SEEN; run_of_visible_frames_ = 0; } @@ -126,7 +126,7 @@ class ArfFreqTest return frames; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return; const int frames = GetNumFramesInPkt(pkt); if (frames == 1) { @@ -145,8 +145,8 @@ class ArfFreqTest } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); encoder->Control(VP9E_SET_TILE_COLUMNS, 4); diff --git a/test/vp9_block_error_test.cc b/test/vp9_block_error_test.cc index b93b014e6..0645341ac 100644 --- a/test/vp9_block_error_test.cc +++ b/test/vp9_block_error_test.cc @@ -53,14 +53,14 @@ int64_t BlockError8BitWrapper(const tran_low_t *coeff, class BlockErrorTest : public ::testing::TestWithParam<BlockErrorParam> { public: - virtual ~BlockErrorTest() {} - virtual void SetUp() { + ~BlockErrorTest() 
override = default; + void SetUp() override { error_block_op_ = GET_PARAM(0); ref_error_block_op_ = GET_PARAM(1); bit_depth_ = GET_PARAM(2); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: vpx_bit_depth_t bit_depth_; @@ -197,4 +197,22 @@ INSTANTIATE_TEST_SUITE_P( &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8))); #endif // HAVE_AVX2 + +#if HAVE_NEON +const BlockErrorParam neon_block_error_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c, + VPX_BITS_10), + make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c, + VPX_BITS_12), + make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c, + VPX_BITS_8), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&BlockError8BitWrapper<vp9_block_error_neon>, + &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(NEON, BlockErrorTest, + ::testing::ValuesIn(neon_block_error_tests)); +#endif // HAVE_NEON } // namespace diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh new file mode 100755 index 000000000..03843610d --- /dev/null +++ b/test/vp9_c_vs_simd_encode.sh @@ -0,0 +1,420 @@ +#!/bin/sh +## +## Copyright (c) 2023 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This script checks the bit exactness between C and SIMD +## implementations of VP9 encoder. +## +. $(dirname $0)/tools_common.sh + +TEST_BITRATES="1600 6400" +PRESETS="good rt" +TEST_CLIPS="yuv_raw_input y4m_360p_10bit_input yuv_480p_raw_input y4m_720p_input" +OUT_FILE_SUFFIX=".ivf" +SCRIPT_DIR=$(dirname "$0") +LIBVPX_SOURCE_DIR=$(cd "${SCRIPT_DIR}/.."; pwd) + +# Clips used in test. +YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" +YUV_480P_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_640_480_30.yuv" +Y4M_360P_10BIT_INPUT="${LIBVPX_TEST_DATA_PATH}/crowd_run_360p_10_150f.y4m" +Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m" + +# Number of frames to test. +VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT=20 + +# Create a temporary directory for output files. +if [ -n "${TMPDIR}" ]; then + VPX_TEST_TEMP_ROOT="${TMPDIR}" +elif [ -n "${TEMPDIR}" ]; then + VPX_TEST_TEMP_ROOT="${TEMPDIR}" +else + VPX_TEST_TEMP_ROOT=/tmp +fi + +VPX_TEST_OUTPUT_DIR="${VPX_TEST_TEMP_ROOT}/vpx_test_$$" + +if ! mkdir -p "${VPX_TEST_OUTPUT_DIR}" || \ + [ ! -d "${VPX_TEST_OUTPUT_DIR}" ]; then + echo "${0##*/}: Cannot create output directory, giving up." + echo "${0##*/}: VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}" + exit 1 +fi + +elog() { + echo "$@" 1>&2 +} + +# Echoes path to $1 when it's executable and exists in ${VPX_TEST_OUTPUT_DIR}, +# or an empty string. Caller is responsible for testing the string once the +# function returns. +vp9_enc_tool_path() { + local target="$1" + local tool_path="${VPX_TEST_OUTPUT_DIR}/build_target_${target}/vpxenc" + + if [ ! -x "${tool_path}" ]; then + tool_path="" + fi + echo "${tool_path}" +} + +# Environment check: Make sure input and source directories are available. +vp9_c_vs_simd_enc_verify_environment() { + if [ ! 
-e "${YUV_RAW_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${YUV_480P_RAW_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${Y4M_720P_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${Y4M_360P_10BIT_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -d "$LIBVPX_SOURCE_DIR" ]; then + elog "LIBVPX_SOURCE_DIR does not exist." + return 1 + fi +} + +# This is not needed since tools_common.sh does the same cleanup. +# Keep the code here for our reference. +# cleanup() { +# rm -rf ${VPX_TEST_OUTPUT_DIR} +# } + +# Echo VPX_SIMD_CAPS_MASK for different instruction set architecture. +avx512f() { + echo "0x1FF" +} + +avx2() { + echo "0x0FF" +} + +sse4_1() { + echo "0x03F" +} + +ssse3() { + echo "0x01F" +} + +sse2() { + echo "0x007" +} + +# Echo clip details to be used as input to vpxenc. +yuv_raw_input() { + echo ""${YUV_RAW_INPUT}" + --width=352 + --height=288 + --bit-depth=8 + --profile=0" +} + +yuv_480p_raw_input() { + echo ""${YUV_480P_RAW_INPUT}" + --width=640 + --height=480 + --bit-depth=8 + --profile=0" +} + +y4m_720p_input() { + echo ""${Y4M_720P_INPUT}" + --bit-depth=8 + --profile=0" +} + +y4m_360p_10bit_input() { + echo ""${Y4M_360P_10BIT_INPUT}" + --bit-depth=10 + --profile=2" +} + +has_x86_isa_extn() { + instruction_set=$1 + if ! grep -q "$instruction_set" /proc/cpuinfo; then + # This instruction_set is not supported. + return 1 + fi + # This instruction_set is supported. + return 0 +} + +# Echo good encode params for use with VP9 encoder. +vp9_encode_good_params() { + echo "--codec=vp9 \ + --good \ + --test-decode=fatal \ + --ivf \ + --threads=1 \ + --static-thresh=0 \ + --tile-columns=0 \ + --end-usage=vbr \ + --kf-max-dist=160 \ + --kf-min-dist=0 \ + --lag-in-frames=19 \ + --max-q=63 \ + --min-q=0 \ + --passes=2 \ + --undershoot-pct=100 \ + --overshoot-pct=100 \ + --verbose \ + --auto-alt-ref=1 \ + --drop-frame=0 \ + --bias-pct=50 \ + --minsection-pct=0 \ + --maxsection-pct=2000 \ + --arnr-maxframes=7 \ + --arnr-strength=5 \ + --sharpness=0 \ + --frame-parallel=0" +} + +# Echo realtime encode params for use with VP9 encoder. +vp9_encode_rt_params() { + echo "--codec=vp9 \ + --rt \ + --test-decode=fatal \ + --ivf \ + --threads=1 \ + --static-thresh=0 \ + --tile-columns=0 \ + --tile-rows=0 \ + --end-usage=cbr \ + --kf-max-dist=90000 \ + --lag-in-frames=0 \ + --max-q=58 \ + --min-q=2 \ + --passes=1 \ + --undershoot-pct=50 \ + --overshoot-pct=50 \ + --verbose \ + --row-mt=0 \ + --buf-sz=1000 \ + --buf-initial-sz=500 \ + --buf-optimal-sz=600 \ + --max-intra-rate=300 \ + --resize-allowed=0 \ + --noise-sensitivity=0 \ + --aq-mode=3 \ + --error-resilient=0" +} + +# Configures for the given target in the +# ${VPX_TEST_OUTPUT_DIR}/build_target_${target} directory. 
+vp9_enc_build() { + local target=$1 + local configure="$2" + local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target} + mkdir -p "$tmp_build_dir" + local save_dir="$PWD" + cd "$tmp_build_dir" + + echo "Building target: ${target}" + local config_args="--disable-install-docs \ + --enable-unit-tests \ + --enable-debug \ + --enable-postproc \ + --enable-vp9-postproc \ + --enable-vp9-temporal-denoising \ + --enable-vp9-highbitdepth" + + eval "$configure" --target="${target}" "${config_args}" ${devnull} + eval make -j$(nproc) ${devnull} + echo "Done building target: ${target}" + cd "${save_dir}" +} + +compare_enc_output() { + local target=$1 + local cpu=$2 + local clip=$3 + local bitrate=$4 + local preset=$5 + if ! diff -q ${VPX_TEST_OUTPUT_DIR}/Out-generic-gnu-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX}; then + elog "C vs ${target} encode mismatches for ${clip}, at ${bitrate} kbps, speed ${cpu}, ${preset} preset" + return 1 + fi +} + +vp9_enc_test() { + local encoder="$1" + local target=$2 + if [ -z "$(vp9_enc_tool_path "${target}")" ]; then + elog "vpxenc not found. It must exist in ${VPX_TEST_OUTPUT_DIR}/build_target_${target} path" + return 1 + fi + + local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target} + local save_dir="$PWD" + cd "$tmp_build_dir" + for preset in ${PRESETS}; do + if [ "${preset}" = "good" ]; then + local max_cpu_used=5 + local test_params=vp9_encode_good_params + elif [ "${preset}" = "rt" ]; then + local max_cpu_used=9 + local test_params=vp9_encode_rt_params + else + elog "Invalid preset" + cd "${save_dir}" + return 1 + fi + + # Enable armv8 test for real-time only + if [ "${preset}" = "good" ] && [ "${target}" = "armv8-linux-gcc" ]; then + continue + fi + + for cpu in $(seq 0 $max_cpu_used); do + for clip in ${TEST_CLIPS}; do + for bitrate in ${TEST_BITRATES}; do + eval "${encoder}" $($clip) $($test_params) \ + "--limit=${VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT}" \ + "--cpu-used=${cpu}" "--target-bitrate=${bitrate}" "-o" \ + ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${devnull} + + if [ "${target}" != "generic-gnu" ]; then + if ! compare_enc_output ${target} $cpu ${clip} $bitrate ${preset}; then + # Find the mismatch + cd "${save_dir}" + return 1 + fi + fi + done + done + done + done + cd "${save_dir}" +} + +vp9_test_generic() { + local configure="$LIBVPX_SOURCE_DIR/configure" + local target="generic-gnu" + + echo "Build for: ${target}" + vp9_enc_build ${target} ${configure} + local encoder="$(vp9_enc_tool_path "${target}")" + vp9_enc_test $encoder "${target}" +} + +# This function encodes VP9 bitstream by enabling SSE2, SSSE3, SSE4_1, AVX2, AVX512f as there are +# no functions with MMX, SSE, SSE3 and AVX specialization. +# The value of environment variable 'VPX_SIMD_CAPS' controls enabling of different instruction +# set extension optimizations. 
The value of the flag 'VPX_SIMD_CAPS' and the corresponding +# instruction set extension optimizations enabled are as follows: +# AVX512 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX +# 1 1 1 1 1 1 1 1 1 -> 0x1FF -> Enable AVX512 and lower variants +# 0 1 1 1 1 1 1 1 1 -> 0x0FF -> Enable AVX2 and lower variants +# 0 0 1 1 1 1 1 1 1 -> 0x07F -> Enable AVX and lower variants +# 0 0 0 1 1 1 1 1 1 -> 0x03F -> Enable SSE4_1 and lower variants +# 0 0 0 0 1 1 1 1 1 -> 0x01F -> Enable SSSE3 and lower variants +# 0 0 0 0 0 1 1 1 1 -> 0x00F -> Enable SSE3 and lower variants +# 0 0 0 0 0 0 1 1 1 -> 0x007 -> Enable SSE2 and lower variants +# 0 0 0 0 0 0 0 1 1 -> 0x003 -> Enable SSE and lower variants +# 0 0 0 0 0 0 0 0 1 -> 0x001 -> Enable MMX +## NOTE: On x86_64 platforms, it is not possible to enable sse/mmx/c using "VPX_SIMD_CAPS_MASK" as +# all x86_64 platforms implement sse2. +vp9_test_x86() { + local arch=$1 + + if ! uname -m | grep -q "x86"; then + elog "Machine architecture is not x86 or x86_64" + return 0 + fi + + if [ $arch = "x86" ]; then + local target="x86-linux-gcc" + elif [ $arch = "x86_64" ]; then + local target="x86_64-linux-gcc" + fi + + local x86_isa_variants="avx512f avx2 sse4_1 ssse3 sse2" + local configure="$LIBVPX_SOURCE_DIR/configure" + + echo "Build for x86: ${target}" + vp9_enc_build ${target} ${configure} + local encoder="$(vp9_enc_tool_path "${target}")" + for isa in $x86_isa_variants; do + # Note that has_x86_isa_extn returns 1 (shell false) when the extension is unsupported. + if ! has_x86_isa_extn $isa; then + echo "${isa} is not supported on this machine" + continue + fi + export VPX_SIMD_CAPS_MASK=$($isa) + if ! vp9_enc_test $encoder ${target}; then + # Find the mismatch + return 1 + fi + unset VPX_SIMD_CAPS_MASK + done +} + +vp9_test_arm() { + local target="armv8-linux-gcc" + local configure="CROSS=aarch64-linux-gnu- $LIBVPX_SOURCE_DIR/configure --extra-cflags=-march=armv8.4-a \ + --extra-cxxflags=-march=armv8.4-a" + echo "Build for arm64: ${target}" + vp9_enc_build ${target} "${configure}" + + local encoder="$(vp9_enc_tool_path "${target}")" + if ! vp9_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" ${target}; then + # Find the mismatch + return 1 + fi +} + +vp9_c_vs_simd_enc_test() { + # Test Generic + vp9_test_generic + + # Test x86 (32 bit) + echo "vp9 test for x86 (32 bit): Started." + if ! vp9_test_x86 "x86"; then + echo "vp9 test for x86 (32 bit): Done, test failed." + return 1 + else + echo "vp9 test for x86 (32 bit): Done, all tests passed." + fi + + # Test x86_64 (64 bit) + if [ "$(eval uname -m)" = "x86_64" ]; then + echo "vp9 test for x86_64 (64 bit): Started." + if ! vp9_test_x86 "x86_64"; then + echo "vp9 test for x86_64 (64 bit): Done, test failed." + return 1 + else + echo "vp9 test for x86_64 (64 bit): Done, all tests passed." + fi + fi + + # Test ARM + echo "vp9 test for arm: Started." + if ! vp9_test_arm; then + echo "vp9 test for arm: Done, test failed." + return 1 + else + echo "vp9 test for arm: Done, all tests passed." + fi +} + +# Set up a trap function to clean up build and output files after tests complete.
+# trap cleanup EXIT + +run_tests vp9_c_vs_simd_enc_verify_environment vp9_c_vs_simd_enc_test diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index 7e9180749..4bc909920 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -28,7 +28,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { } protected: - virtual ~DatarateTestVP9() {} + ~DatarateTestVP9() override = default; virtual void ResetModel() { last_pts_ = 0; @@ -113,8 +113,8 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { return layer_id; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); @@ -164,7 +164,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { duration_ = 0; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Time since last timestamp = duration. vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; @@ -202,7 +202,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { ++tot_frame_number_; } - virtual void EndPassHook() { + void EndPassHook() override { for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers); ++layer) { duration_ = (last_pts_ + 1) * timebase_; @@ -243,7 +243,7 @@ class DatarateTestVP9RealTimeMultiBR DatarateTestVP9RealTimeMultiBR() : DatarateTestVP9(GET_PARAM(0)) {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -259,7 +259,7 @@ class DatarateTestVP9LargeVBR DatarateTestVP9LargeVBR() : DatarateTestVP9(GET_PARAM(0)) {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -579,10 +579,10 @@ class DatarateTestVP9RealTime : public DatarateTestVP9, public ::libvpx_test::CodecTestWithParam<int> { public: DatarateTestVP9RealTime() : DatarateTestVP9(GET_PARAM(0)) {} - virtual ~DatarateTestVP9RealTime() {} + ~DatarateTestVP9RealTime() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -731,10 +731,10 @@ class DatarateTestVP9RealTimeDeltaQUV public ::libvpx_test::CodecTestWith2Params<int, int> { public: DatarateTestVP9RealTimeDeltaQUV() : DatarateTestVP9(GET_PARAM(0)) {} - virtual ~DatarateTestVP9RealTimeDeltaQUV() {} + ~DatarateTestVP9RealTimeDeltaQUV() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -779,7 +779,7 @@ class DatarateTestVP9PostEncodeDrop DatarateTestVP9PostEncodeDrop() : DatarateTestVP9(GET_PARAM(0)) {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -819,17 +819,17 @@ class DatarateTestVP9FrameQp public ::testing::TestWithParam<const libvpx_test::CodecFactory *> { public: DatarateTestVP9FrameQp() : DatarateTestVP9(GetParam()), frame_(0) {} - virtual ~DatarateTestVP9FrameQp() {} + ~DatarateTestVP9FrameQp() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); 
SetMode(::libvpx_test::kRealTime); ResetModel(); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { set_cpu_used_ = 7; DatarateTestVP9::PreEncodeFrameHook(video, encoder); frame_qp_ = static_cast<int>(rnd_.RandRange(64)); @@ -837,7 +837,7 @@ class DatarateTestVP9FrameQp frame_++; } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { int qp = 0; vpx_svc_layer_id_t layer_id; if (frame_ >= total_frame_) return; @@ -847,8 +847,8 @@ class DatarateTestVP9FrameQp temporal_layer_id_ = layer_id.temporal_layer_id; } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { if (frame_ >= total_frame_) return; ASSERT_TRUE(cfg_.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212 && @@ -945,7 +945,7 @@ TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) { // Params: speed setting. class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { public: - virtual ~DatarateTestVP9RealTimeDenoiser() {} + ~DatarateTestVP9RealTimeDenoiser() override = default; }; // Check basic datarate targeting, for a single bitrate, when denoiser is on. diff --git a/test/vp9_denoiser_test.cc b/test/vp9_denoiser_test.cc index d884b7eb9..831f83305 100644 --- a/test/vp9_denoiser_test.cc +++ b/test/vp9_denoiser_test.cc @@ -42,11 +42,11 @@ class VP9DenoiserTest : public ::testing::Test, public ::testing::WithParamInterface<VP9DenoiserTestParam> { public: - virtual ~VP9DenoiserTest() {} + ~VP9DenoiserTest() override = default; - virtual void SetUp() { bs_ = GET_PARAM(1); } + void SetUp() override { bs_ = GET_PARAM(1); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: BLOCK_SIZE bs_; diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc index ce2198c59..0e182c76d 100644 --- a/test/vp9_encoder_parms_get_to_decoder.cc +++ b/test/vp9_encoder_parms_get_to_decoder.cc @@ -62,9 +62,9 @@ class VpxEncoderParmsGetToDecoder VpxEncoderParmsGetToDecoder() : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {} - virtual ~VpxEncoderParmsGetToDecoder() {} + ~VpxEncoderParmsGetToDecoder() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kTwoPassGood); cfg_.g_lag_in_frames = 25; @@ -74,8 +74,8 @@ class VpxEncoderParmsGetToDecoder cfg_.rc_target_bitrate = test_video_.bitrate; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs); encoder->Control(VP9E_SET_COLOR_RANGE, encode_parms.color_range); @@ -95,9 +95,9 @@ class VpxEncoderParmsGetToDecoder } } - virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder *decoder) { + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder(); 
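// Note: the cast below reaches into the decoder's private state; this is a
// white-box check that parameters set on the encoder (color space, color
// range, etc.) actually round-tripped through the bitstream.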
vpx_codec_alg_priv_t *const priv = reinterpret_cast<vpx_codec_alg_priv_t *>(vp9_decoder->priv); diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc index 7a85db26a..79be4ee14 100644 --- a/test/vp9_end_to_end_test.cc +++ b/test/vp9_end_to_end_test.cc @@ -89,9 +89,9 @@ class EndToEndTestAdaptiveRDThresh : EncoderTest(GET_PARAM(0)), cpu_used_start_(GET_PARAM(1)), cpu_used_end_(GET_PARAM(2)) {} - virtual ~EndToEndTestAdaptiveRDThresh() {} + ~EndToEndTestAdaptiveRDThresh() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); cfg_.g_lag_in_frames = 0; @@ -102,8 +102,8 @@ class EndToEndTestAdaptiveRDThresh dec_cfg_.threads = 4; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_start_); encoder->Control(VP9E_SET_ROW_MT, 1); @@ -131,9 +131,9 @@ class EndToEndTestLarge denoiser_on_ = 0; } - virtual ~EndToEndTestLarge() {} + ~EndToEndTestLarge() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -149,18 +149,18 @@ class EndToEndTestLarge dec_cfg_.threads = 4; } - virtual void BeginPassHook(unsigned int) { + void BeginPassHook(unsigned int) override { psnr_ = 0.0; nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { psnr_ += pkt->data.psnr.psnr[0]; nframes_++; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); encoder->Control(VP9E_SET_TILE_COLUMNS, 4); @@ -207,9 +207,9 @@ class EndToEndTestLoopFilterThreading EndToEndTestLoopFilterThreading() : EncoderTest(GET_PARAM(0)), use_loop_filter_opt_(GET_PARAM(1)) {} - virtual ~EndToEndTestLoopFilterThreading() {} + ~EndToEndTestLoopFilterThreading() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); cfg_.g_threads = 2; @@ -221,16 +221,16 @@ class EndToEndTestLoopFilterThreading dec_cfg_.threads = GET_PARAM(2); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 8); } encoder->Control(VP9E_SET_TILE_COLUMNS, 4 - video->frame() % 5); } - virtual void PreDecodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Decoder *decoder) override { if (video->frame() == 0) { decoder->Control(VP9D_SET_LOOP_FILTER_OPT, use_loop_filter_opt_ ? 
1 : 0); } diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc index 238366cb6..c8d3cba7f 100644 --- a/test/vp9_ethread_test.cc +++ b/test/vp9_ethread_test.cc @@ -21,12 +21,12 @@ namespace { // FIRSTPASS_STATS struct: // { -// 25 double members; +// 26 double members; // 1 int64_t member; // } // Whenever FIRSTPASS_STATS struct is modified, the following constants need to // be revisited. -const int kDbl = 25; +const int kDbl = 26; const int kInt = 1; const size_t kFirstPassStatsSz = kDbl * sizeof(double) + kInt * sizeof(int64_t); @@ -44,9 +44,9 @@ class VPxFirstPassEncoderThreadTest firstpass_stats_.buf = nullptr; firstpass_stats_.sz = 0; } - virtual ~VPxFirstPassEncoderThreadTest() { free(firstpass_stats_.buf); } + ~VPxFirstPassEncoderThreadTest() override { free(firstpass_stats_.buf); } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -57,19 +57,19 @@ class VPxFirstPassEncoderThreadTest cfg_.rc_min_quantizer = 0; } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { encoder_initialized_ = false; abort_ = false; } - virtual void EndPassHook() { + void EndPassHook() override { // For first pass stats test, only run first pass encoder. if (first_pass_only_ && cfg_.g_pass == VPX_RC_FIRST_PASS) abort_ |= first_pass_only_; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, + ::libvpx_test::Encoder *encoder) override { if (!encoder_initialized_) { // Encode in 2-pass mode. encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_); @@ -87,7 +87,7 @@ class VPxFirstPassEncoderThreadTest } } - virtual void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) { + void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) override { const uint8_t *const pkt_buf = reinterpret_cast<uint8_t *>(pkt->data.twopass_stats.buf); const size_t pkt_size = pkt->data.twopass_stats.sz; @@ -185,7 +185,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); // Compare to check if using or not using row-mt generates close stats. - ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0)); + ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0)); // Test single thread vs multiple threads row_mt_mode_ = 1; @@ -199,7 +199,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); // Compare to check if single-thread and multi-thread stats are close enough. - ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0)); + ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0)); // Bit exact test in row_mt mode. 
// When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact @@ -233,9 +233,9 @@ class VPxEncoderThreadTest psnr_ = 0.0; nframes_ = 0; } - virtual ~VPxEncoderThreadTest() {} + ~VPxEncoderThreadTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -252,14 +252,14 @@ class VPxEncoderThreadTest cfg_.rc_min_quantizer = 0; } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { encoder_initialized_ = false; psnr_ = 0.0; nframes_ = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, + ::libvpx_test::Encoder *encoder) override { if (!encoder_initialized_) { // Encode 4 column tiles. encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_); @@ -280,21 +280,21 @@ class VPxEncoderThreadTest } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { psnr_ += pkt->data.psnr.psnr[0]; nframes_++; } - virtual void DecompressedFrameHook(const vpx_image_t &img, - vpx_codec_pts_t /*pts*/) { + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t /*pts*/) override { ::libvpx_test::MD5 md5_res; md5_res.Add(&img); md5_.push_back(md5_res.Get()); } - virtual bool HandleDecodeResult(const vpx_codec_err_t res, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder * /*decoder*/) { + bool HandleDecodeResult(const vpx_codec_err_t res, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder * /*decoder*/) override { if (res != VPX_CODEC_OK) { EXPECT_EQ(VPX_CODEC_OK, res); return false; diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 2bfa6281d..33fa05c65 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -18,6 +18,7 @@ #include "third_party/googletest/src/include/gtest/gtest.h" #include "vp9/simple_encode.h" #include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_tpl.h" #include "vpx_dsp/vpx_dsp_common.h" namespace { @@ -41,7 +42,7 @@ constexpr int kDefaultMaxGfInterval = 16; constexpr int kReadMinGfInterval = 5; constexpr int kReadMaxGfInterval = 13; const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv"; -const double kPsnrThreshold = 30.50; +const double kPsnrThreshold = 30.4; struct ToyRateCtrl { int magic_number; @@ -151,6 +152,19 @@ vpx_rc_status_t rc_send_firstpass_stats_gop_short( return VPX_RC_OK; } +vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model, + const VpxTplGopStats *tpl_gop_stats) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_GT(tpl_gop_stats->size, 0); + + for (int i = 0; i < tpl_gop_stats->size; ++i) { + EXPECT_GT(tpl_gop_stats->frame_stats_list[i].num_blocks, 0); + } + return VPX_RC_OK; +} + vpx_rc_status_t rc_get_encodeframe_decision( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *encode_frame_info, @@ -384,7 +398,7 @@ vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay( EXPECT_EQ(encode_frame_info->show_index, 3); EXPECT_EQ(encode_frame_info->gop_index, 0); EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); } // When the model recommends an invalid q, valid range [0, 255], @@ -678,7 +692,7 @@ class 
ExtRateCtrlTest : public ::libvpx_test::EncoderTest, void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { - vpx_rc_funcs_t rc_funcs; + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_QP; rc_funcs.create_model = rc_create_model; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats; @@ -721,10 +735,11 @@ class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); - vpx_rc_funcs_t rc_funcs; + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop; + rc_funcs.send_tpl_gop_stats = rc_send_tpl_gop_stats; rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop; rc_funcs.get_gop_decision = rc_get_gop_decision; rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop; @@ -768,7 +783,7 @@ class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - vpx_rc_funcs_t rc_funcs; + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -816,7 +831,7 @@ class ExtRateCtrlTestGOPShortOverlay encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - vpx_rc_funcs_t rc_funcs; + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -865,7 +880,7 @@ class ExtRateCtrlTestGOPShortNoARF encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - vpx_rc_funcs_t rc_funcs; + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -919,7 +934,7 @@ class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest, void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { - vpx_rc_funcs_t rc_funcs; + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index ccace719e..c69d43efb 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -55,6 +55,21 @@ class IntraPredTest : public ::testing::TestWithParam<PredParam> { ref_dst_ = ref_dst; int error_count = 0; for (int i = 0; i < count_test_block; ++i) { + // TODO(webm:1797): Some of the optimised predictor implementations rely + // on the trailing half of the above_row_ being a copy of the final + // element; however, relying on this in some cases can cause the MD5 tests + // to fail. We have fixed all of these cases for Neon, so fill the whole + // of above_row_ randomly. +#if HAVE_NEON + // Fill edges with random data, try first with saturated values.
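// Iteration 0 saturates every edge sample to mask_ (the bit-depth maximum)
// to exercise overflow-prone arithmetic; later iterations use random values.
// Unlike the non-Neon branch below, this loop runs over the full
// 2 * block_size extent of above_row_, so predictors that read past
// block_size see independent random data instead of a convenient copy of
// the last element (see the TODO above).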
+ for (int x = -1; x < 2 * block_size; x++) { + if (i == 0) { + above_row_[x] = mask_; + } else { + above_row_[x] = rnd.Rand16() & mask_; + } + } +#else // Fill edges with random data, try first with saturated values. for (int x = -1; x < block_size; x++) { if (i == 0) { @@ -66,6 +81,7 @@ class IntraPredTest : public ::testing::TestWithParam<PredParam> { for (int x = block_size; x < 2 * block_size; x++) { above_row_[x] = above_row_[block_size - 1]; } +#endif for (int y = 0; y < block_size; y++) { if (i == 0) { left_col_[y] = mask_; @@ -80,7 +96,7 @@ class IntraPredTest : public ::testing::TestWithParam<PredParam> { } protected: - virtual void SetUp() { + void SetUp() override { params_ = this->GetParam(); stride_ = params_.block_size * 3; mask_ = (1 << params_.bit_depth) - 1; @@ -243,6 +259,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_d45_predictor_16x16_c, 16, 8), IntraPredParam(&vpx_d45_predictor_32x32_neon, &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d63_predictor_4x4_neon, &vpx_d63_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d63_predictor_8x8_neon, &vpx_d63_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d63_predictor_16x16_neon, + &vpx_d63_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_neon, + &vpx_d63_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d117_predictor_4x4_neon, &vpx_d117_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d117_predictor_8x8_neon, &vpx_d117_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d117_predictor_16x16_neon, + &vpx_d117_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d117_predictor_32x32_neon, + &vpx_d117_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c, @@ -251,6 +283,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_d135_predictor_16x16_c, 16, 8), IntraPredParam(&vpx_d135_predictor_32x32_neon, &vpx_d135_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d153_predictor_4x4_neon, &vpx_d153_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d153_predictor_8x8_neon, &vpx_d153_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d153_predictor_16x16_neon, + &vpx_d153_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d153_predictor_32x32_neon, + &vpx_d153_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d207_predictor_4x4_neon, &vpx_d207_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d207_predictor_8x8_neon, &vpx_d207_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d207_predictor_16x16_neon, + &vpx_d207_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d207_predictor_32x32_neon, + &vpx_d207_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_dc_128_predictor_4x4_neon, &vpx_dc_128_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_dc_128_predictor_8x8_neon, @@ -441,6 +489,15 @@ INSTANTIATE_TEST_SUITE_P( &vpx_v_predictor_32x32_c, 32, 8))); #endif // HAVE_VSX +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_dc_predictor_8x8_lsx, + &vpx_dc_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_predictor_16x16_lsx, + &vpx_dc_predictor_16x16_c, 16, 8))); +#endif // HAVE_LSX + #if CONFIG_VP9_HIGHBITDEPTH typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, @@ -832,6 +889,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + 
&vpx_highbd_d63_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -840,6 +913,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d135_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, &vpx_highbd_d135_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -908,6 +997,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -916,6 +1021,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d135_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, &vpx_highbd_d135_predictor_32x32_c, 32, 10), + 
HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -984,6 +1105,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -992,6 +1129,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d135_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, &vpx_highbd_d135_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, diff --git a/test/vp9_lossless_test.cc b/test/vp9_lossless_test.cc index 931ac30a3..fe3cd1aba 100644 ---
a/test/vp9_lossless_test.cc +++ b/test/vp9_lossless_test.cc @@ -29,15 +29,15 @@ class LosslessTest : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0), encoding_mode_(GET_PARAM(1)) {} - virtual ~LosslessTest() {} + ~LosslessTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { // Only call Control if quantizer > 0 to verify that using quantizer // alone will activate lossless @@ -47,12 +47,12 @@ class LosslessTest } } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { psnr_ = kMaxPsnr; nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.psnr.psnr[0] < psnr_) psnr_ = pkt->data.psnr.psnr[0]; } diff --git a/test/vp9_motion_vector_test.cc b/test/vp9_motion_vector_test.cc index 6b1082a10..495ea11fc 100644 --- a/test/vp9_motion_vector_test.cc +++ b/test/vp9_motion_vector_test.cc @@ -42,9 +42,9 @@ class MotionVectorTestLarge : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {} - virtual ~MotionVectorTestLarge() {} + ~MotionVectorTestLarge() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -59,8 +59,8 @@ class MotionVectorTestLarge } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); encoder->Control(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_); diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 587cec692..e00ab4022 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -26,6 +26,7 @@ #include "test/util.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" @@ -38,33 +39,44 @@ namespace { const int number_of_iterations = 100; typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, const int16_t *quant_shift, + const macroblock_plane *mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan); + const struct ScanOrder *const scan_order); typedef std::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t, int /*max_size*/, bool /*is_fp*/> QuantizeParam; +// Wrapper for 32x32 version which does not use count +typedef void (*Quantize32x32Func)(const tran_low_t *coeff, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const scan_order); + +template <Quantize32x32Func fn> +void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const 
scan_order) { + (void)count; + fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); +} + // Wrapper for FP version which does not use zbin or quant_shift. typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, - const int16_t *round, const int16_t *quant, + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan); + const struct ScanOrder *const scan_order); template <QuantizeFPFunc fn> void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, const int16_t *quant_shift, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - const int16_t *dequant, uint16_t *eob, const int16_t *scan, - const int16_t *iscan) { - (void)zbin; - (void)quant_shift; - - fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan); + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const scan_order) { + fn(coeff, count, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -119,17 +131,21 @@ class VP9QuantizeBase : public AbstractBench { #else max_value_ = (1 << bit_depth_) - 1; #endif - zbin_ptr_ = + + mb_plane_ = reinterpret_cast<macroblock_plane *>( + vpx_memalign(16, sizeof(macroblock_plane))); + + zbin_ptr_ = mb_plane_->zbin = reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); - round_fp_ptr_ = reinterpret_cast<int16_t *>( + round_fp_ptr_ = mb_plane_->round_fp = reinterpret_cast<int16_t *>( vpx_memalign(16, 8 * sizeof(*round_fp_ptr_))); - quant_fp_ptr_ = reinterpret_cast<int16_t *>( + quant_fp_ptr_ = mb_plane_->quant_fp = reinterpret_cast<int16_t *>( vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_))); - round_ptr_ = + round_ptr_ = mb_plane_->round = reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_))); - quant_ptr_ = + quant_ptr_ = mb_plane_->quant = reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*quant_ptr_))); - quant_shift_ptr_ = reinterpret_cast<int16_t *>( + quant_shift_ptr_ = mb_plane_->quant_shift = reinterpret_cast<int16_t *>( vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_))); dequant_ptr_ = reinterpret_cast<int16_t *>( vpx_memalign(16, 8 * sizeof(*dequant_ptr_))); @@ -138,7 +154,8 @@ class VP9QuantizeBase : public AbstractBench { q_ptr_ = (is_fp_) ? 
quant_fp_ptr_ : quant_ptr_; } - ~VP9QuantizeBase() { + ~VP9QuantizeBase() override { + vpx_free(mb_plane_); vpx_free(zbin_ptr_); vpx_free(round_fp_ptr_); vpx_free(quant_fp_ptr_); @@ -146,6 +163,7 @@ class VP9QuantizeBase : public AbstractBench { vpx_free(quant_ptr_); vpx_free(quant_shift_ptr_); vpx_free(dequant_ptr_); + mb_plane_ = nullptr; zbin_ptr_ = nullptr; round_fp_ptr_ = nullptr; quant_fp_ptr_ = nullptr; @@ -157,9 +175,10 @@ class VP9QuantizeBase : public AbstractBench { } protected: + macroblock_plane *mb_plane_; int16_t *zbin_ptr_; - int16_t *round_fp_ptr_; int16_t *quant_fp_ptr_; + int16_t *round_fp_ptr_; int16_t *round_ptr_; int16_t *quant_ptr_; int16_t *quant_shift_ptr_; @@ -174,7 +193,7 @@ class VP9QuantizeBase : public AbstractBench { int16_t *r_ptr_; int16_t *q_ptr_; int count_; - const scan_order *scan_; + const ScanOrder *scan_; uint16_t eob_; }; @@ -186,17 +205,15 @@ class VP9QuantizeTest : public VP9QuantizeBase, quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {} protected: - virtual void Run(); + void Run() override; void Speed(bool is_median); const QuantizeFunc quantize_op_; const QuantizeFunc ref_quantize_op_; }; void VP9QuantizeTest::Run() { - quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, - scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_); } void VP9QuantizeTest::Speed(bool is_median) { @@ -266,19 +283,18 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, - q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + scan_); } vpx_usec_timer_mark(&timer); vpx_usec_timer_start(&simd_timer); for (int n = 0; n < kNumTests; ++n) { - quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, - scan_->scan, scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_); } vpx_usec_timer_mark(&simd_timer); @@ -298,14 +314,16 @@ void VP9QuantizeTest::Speed(bool is_median) { // determine if further multiplication operations are needed. // Based on vp9_quantize_fp_sse2(). inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, + const struct ScanOrder *const scan_order, int is_32x32) { int i, eob = -1; const int thr = dequant_ptr[1] >> (1 + is_32x32); - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. 
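All of the quantize-test hunks in this file follow one mechanical refactor: the six loose pointer arguments (zbin, round, quant, quant_shift, scan, iscan) are folded into a macroblock_plane and a ScanOrder, with QuantFPWrapper and Quant32x32Wrapper adapting the variants that ignore part of that state. A minimal sketch of the new calling convention (the vp9_default_scan_orders lookup and the buffer names are assumptions; the signature matches this diff):

// Hedged sketch: invoking a quantizer through the consolidated interface.
macroblock_plane plane = {};        // Only the fields a given variant reads
plane.round_fp = round_fp_values;   //   need to be populated; the FP path
plane.quant_fp = quant_fp_values;   //   uses round_fp/quant_fp only.
const ScanOrder *order = &vp9_default_scan_orders[TX_16X16];  // assumed table
uint16_t eob = 0;
vp9_quantize_fp_c(coeff, 256, &plane, qcoeff, dqcoeff, dequant, &eob, order);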
@@ -372,21 +390,21 @@ inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - quant_fp_nz(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr, - dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0); + const struct ScanOrder *const scan_order) { + quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan_order, 0); } void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - quant_fp_nz(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr, - dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1); + const struct ScanOrder *const scan_order) { + quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan_order, 1); } TEST_P(VP9QuantizeTest, OperationCheck) { @@ -417,15 +435,13 @@ TEST_P(VP9QuantizeTest, OperationCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_); ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -475,15 +491,13 @@ TEST_P(VP9QuantizeTest, EOBCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_); ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -510,27 +524,30 @@ using std::make_tuple; INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, + make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c, VPX_BITS_8, 16, false), 
make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>, &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, VPX_BITS_12, 16, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_sse2>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_sse2>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_sse2>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, + VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, + ::testing::Values(make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>, &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, @@ -541,11 +558,11 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, + ::testing::Values(make_tuple(vpx_quantize_b_ssse3, vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_ssse3>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + VPX_BITS_8, 32, false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>, &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true), @@ -555,13 +572,13 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSSE3 #if HAVE_AVX -INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_avx, - &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx, - &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false))); +INSTANTIATE_TEST_SUITE_P( + AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(vpx_quantize_b_avx, vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + VPX_BITS_8, 32, false))); #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 @@ -577,22 +594,26 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_avx2>, &QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12, 32, true), - make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16, + make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_avx2, 
vpx_highbd_quantize_b_c, VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, VPX_BITS_12, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c, + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32, + false), + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_avx2>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_avx2>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_avx2>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, + VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, @@ -602,11 +623,11 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_avx2>, &QuantFPWrapper<quantize_fp_32x32_nz_c>, VPX_BITS_8, 32, true), - make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, + make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx2, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false))); + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + VPX_BITS_8, 32, false))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX2 @@ -617,20 +638,24 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, VPX_BITS_12, 16, false), - make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32, + false), + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_neon>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false), + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_neon>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_neon>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, + VPX_BITS_12, 32, false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>, &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>, @@ 
-641,9 +666,9 @@ INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_neon, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + VPX_BITS_8, 32, false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>, &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true), @@ -670,13 +695,13 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH #if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_lsx, - &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_lsx, - &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false))); +INSTANTIATE_TEST_SUITE_P( + LSX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_lsx, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_lsx>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + VPX_BITS_8, 32, false))); #endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH // Only useful to compare "Speed" test results. @@ -684,8 +709,9 @@ INSTANTIATE_TEST_SUITE_P( DISABLED_C, VP9QuantizeTest, ::testing::Values( make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8, - 32, false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32, + false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>, &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper<quantize_fp_nz_c>, diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 1d1a78f43..f7be47542 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -31,6 +31,7 @@ const int kTemporalId2Layer[2] = { 0, 1 }; const int kTemporalRateAllocation3Layer[3] = { 50, 70, 100 }; const int kTemporalRateAllocation2Layer[2] = { 60, 100 }; const int kSpatialLayerBitrate[3] = { 200, 400, 1000 }; +const int kSpatialLayerBitrateLow[3] = { 50, 100, 400 }; class RcInterfaceTest : public ::libvpx_test::EncoderTest, @@ -38,28 +39,34 @@ class RcInterfaceTest public: RcInterfaceTest() : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), - encoder_exit_(false) {} + encoder_exit_(false), frame_drop_thresh_(0), num_drops_(0) {} - virtual ~RcInterfaceTest() {} + ~RcInterfaceTest() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); - encoder->Control(VP9E_SET_TUNE_CONTENT, 0); + if (rc_cfg_.is_screen) { + encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_SCREEN); + } else { + encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_DEFAULT); + } encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); } - frame_params_.frame_type = - video->frame() % key_interval_ == 0 ? 
KEY_FRAME : INTER_FRAME; - if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) { + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + if (rc_cfg_.rc_mode == VPX_CBR && + frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { // Disable golden frame update. frame_flags_ |= VP8_EFLAG_NO_UPD_GF; frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; @@ -67,20 +74,23 @@ class RcInterfaceTest encoder_exit_ = video->frame() == kNumFrames; } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { if (encoder_exit_) { return; } int loopfilter_level, qp; encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); - rc_api_->ComputeQP(frame_params_); - ASSERT_EQ(rc_api_->GetQP(), qp); - ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) { + ASSERT_EQ(rc_api_->GetQP(), qp); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + } else { + num_drops_++; + } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - rc_api_->PostEncodeUpdate(pkt->data.frame.sz); + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + rc_api_->PostEncodeUpdate(pkt->data.frame.sz, frame_params_); } void RunOneLayer() { @@ -95,6 +105,42 @@ class RcInterfaceTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunOneLayerScreen() { + SetConfig(GET_PARAM(2)); + rc_cfg_.is_screen = true; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunOneLayerDropFramesCBR() { + if (GET_PARAM(2) != VPX_CBR) { + GTEST_SKIP() << "Frame dropping is only for CBR mode."; + } + frame_drop_thresh_ = 30; + SetConfig(GET_PARAM(2)); + // Use lower bitrate, lower max-q, and enable frame dropper. + rc_cfg_.target_bandwidth = 200; + cfg_.rc_target_bitrate = 200; + rc_cfg_.max_quantizer = 50; + cfg_.rc_max_quantizer = 50; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + void RunOneLayerVBRPeriodicKey() { if (GET_PARAM(2) != VPX_VBR) return; key_interval_ = 100; @@ -132,6 +178,7 @@ class RcInterfaceTest rc_cfg_.min_quantizers[0] = 2; rc_cfg_.rc_mode = rc_mode; rc_cfg_.aq_mode = aq_mode_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; // Encoder settings for ground truth. 
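Taken together, the hooks above define the calling contract for the external rate controller: ComputeQP() now returns a FrameDropDecision, and only frames it accepts are cross-checked against the encoder's QP and loopfilter level and fed back through PostEncodeUpdate(), which now also takes the frame parameters. A sketch of that per-frame driver loop, assuming the ratectrl_rtc API shown above and a hypothetical EncodeWithQp() helper for the actual encode step:

    // Per-frame driver implied by the hooks above. EncodeWithQp() is a
    // hypothetical stand-in for running the encoder at the chosen QP.
    libvpx::VP9FrameParamsQpRTC params;
    params.frame_type = libvpx::RcFrameType::kInterFrame;
    if (rc_api->ComputeQP(params) == libvpx::FrameDropDecision::kOk) {
      const int qp = rc_api->GetQP();               // QP picked by the external RC
      const int lf = rc_api->GetLoopfilterLevel();  // matching loopfilter level
      const size_t bytes = EncodeWithQp(qp, lf);    // hypothetical encode step
      rc_api->PostEncodeUpdate(bytes, params);      // report the actual size
    }
    // else: the frame is dropped and nothing is encoded or reported.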
cfg_.g_w = 1280; @@ -150,6 +197,7 @@ class RcInterfaceTest cfg_.rc_target_bitrate = 1000; cfg_.kf_min_dist = key_interval_; cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; } std::unique_ptr<libvpx::VP9RateControlRTC> rc_api_; @@ -158,23 +206,31 @@ class RcInterfaceTest int key_interval_; libvpx::VP9FrameParamsQpRTC frame_params_; bool encoder_exit_; + int frame_drop_thresh_; + int num_drops_; }; -class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam<int> { +class RcInterfaceSvcTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params<int, bool> { public: - RcInterfaceSvcTest() : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)) {} - virtual ~RcInterfaceSvcTest() {} + RcInterfaceSvcTest() + : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), + dynamic_spatial_layers_(0), inter_layer_pred_off_(GET_PARAM(2)), + parallel_spatial_layers_(false), frame_drop_thresh_(0), + max_consec_drop_(INT_MAX), num_drops_(0) {} + ~RcInterfaceSvcTest() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { + current_superframe_ = 0; encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); encoder->Control(VP9E_SET_TUNE_CONTENT, 0); @@ -182,11 +238,23 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); encoder->Control(VP9E_SET_SVC, 1); encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + if (inter_layer_pred_off_) { + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, + INTER_LAYER_PRED_OFF_NONKEY); + } + if (frame_drop_thresh_ > 0) { + vpx_svc_frame_drop_t svc_drop_frame; + svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP; + for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) + svc_drop_frame.framedrop_thresh[sl] = frame_drop_thresh_; + svc_drop_frame.max_consec_drop = max_consec_drop_; + encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); + } } - frame_params_.frame_type = - video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; encoder_exit_ = video->frame() == kNumFrames; - current_superframe_ = video->frame(); if (dynamic_spatial_layers_ == 1) { if (video->frame() == 100) { // Go down to 2 spatial layers: set top SL to 0 bitrate. @@ -201,7 +269,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, rc_cfg_.layer_target_bitrate[6] = 0; rc_cfg_.layer_target_bitrate[7] = 0; rc_cfg_.layer_target_bitrate[8] = 0; - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); } else if (video->frame() == 200) { // Go down to 1 spatial layer. // Update the encoder config. 
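The dynamic-layer hunks that follow also start asserting on UpdateRateControl()'s return value whenever the layer set is reconfigured mid-stream. Reduced to its essentials, dropping the top spatial layer looks like this (a sketch using the fixture's cfg_/rc_cfg_ members; indices assume 3 spatial x 3 temporal layers, so the top layer occupies entries 6..8):

    // Disable spatial layer 2 at runtime: zero its per-layer bitrates in
    // both the encoder config and the external-RC config, then push both.
    for (int tl = 0; tl < 3; ++tl) {
      cfg_.layer_target_bitrate[2 * 3 + tl] = 0;     // encoder-side view
      rc_cfg_.layer_target_bitrate[2 * 3 + tl] = 0;  // external-RC view
    }
    encoder->Config(&cfg_);
    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));  // must succeed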
@@ -215,8 +283,8 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, rc_cfg_.layer_target_bitrate[3] = 0; rc_cfg_.layer_target_bitrate[4] = 0; rc_cfg_.layer_target_bitrate[5] = 0; - rc_api_->UpdateRateControl(rc_cfg_); - } else if (0 && video->frame() == 280) { + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + } else if (/*DISABLES CODE*/ (false) && video->frame() == 280) { // TODO(marpan): Re-enable this going back up when issue is fixed. // Go back up to 3 spatial layers. // Update the encoder config: use the original bitrates. @@ -224,52 +292,92 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, encoder->Config(&cfg_); // Update the RC config. SetRCConfigSvc(3, 3); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); } } } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + virtual void SetFrameParamsSvc(int sl) { + frame_params_.spatial_layer_id = sl; + if (rc_cfg_.ts_number_layers == 3) + frame_params_.temporal_layer_id = + kTemporalId3Layer[current_superframe_ % 4]; + else if (rc_cfg_.ts_number_layers == 2) + frame_params_.temporal_layer_id = + kTemporalId2Layer[current_superframe_ % 2]; + else + frame_params_.temporal_layer_id = 0; + frame_params_.frame_type = + current_superframe_ % key_interval_ == 0 && sl == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + } + + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + if (encoder_exit_) { + return; + } + int superframe_is_dropped = false; ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0; + std::vector<int> rc_qp; + // For FULL_SUPERFRAME_DROP: the full superframe drop decision is + // determined on the base spatial layer. + SetFrameParamsSvc(0); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kDrop) { + superframe_is_dropped = true; + num_drops_++; + } while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + ASSERT_EQ(superframe_is_dropped, false); ParseSuperframeSizes(static_cast<const uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz); - for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { - if (sizes_[sl] > 0) { - frame_params_.spatial_layer_id = sl; - if (rc_cfg_.ts_number_layers == 3) - frame_params_.temporal_layer_id = - kTemporalId3Layer[current_superframe_ % 4]; - else if (rc_cfg_.ts_number_layers == 2) - frame_params_.temporal_layer_id = - kTemporalId2Layer[current_superframe_ % 2]; - else - frame_params_.temporal_layer_id = 0; - rc_api_->ComputeQP(frame_params_); - frame_params_.frame_type = INTER_FRAME; - rc_api_->PostEncodeUpdate(sizes_[sl]); + if (!parallel_spatial_layers_ || current_superframe_ == 0) { + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + if (sizes_[sl] > 0) { + SetFrameParamsSvc(sl); + // For sl=0 ComputeQP() is already called above (line 310). + if (sl > 0) rc_api_->ComputeQP(frame_params_); + rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_); + rc_qp.push_back(rc_api_->GetQP()); + } + } + } else { + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + // For sl=0 ComputeQP() is already called above (line 310). 
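The restructured hook above separates two valid orderings. With FULL_SUPERFRAME_DROP the drop decision is made once, on the base spatial layer; after that, the sequential path interleaves ComputeQP() and PostEncodeUpdate() per layer, while the parallel path (inter-layer prediction off) issues every ComputeQP() before any PostEncodeUpdate(). Compressed to the essentials (a sketch; rc, sizes, and SetParams() stand in for the fixture's rc_api_, sizes_, and SetFrameParamsSvc()):

    // Sequential SVC: layer sl's QP may depend on layer sl-1's encoded
    // size, so compute and update must interleave.
    for (int sl = 0; sl < num_layers; ++sl) {
      SetParams(sl);
      rc->ComputeQP(params);
      rc->PostEncodeUpdate(sizes[sl], params);
    }

    // Parallel SVC (inter-layer prediction off): all QPs first, then all
    // updates, matching encoders that process spatial layers concurrently.
    for (int sl = 0; sl < num_layers; ++sl) {
      SetParams(sl);
      rc->ComputeQP(params);
    }
    for (int sl = 0; sl < num_layers; ++sl) {
      SetParams(sl);
      rc->PostEncodeUpdate(sizes[sl], params);
    }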
+ if (sizes_[sl] > 0 && sl > 0) { + SetFrameParamsSvc(sl); + rc_api_->ComputeQP(frame_params_); + } + } + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + if (sizes_[sl] > 0) { + SetFrameParamsSvc(sl); + rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_); + rc_qp.push_back(rc_api_->GetQP()); + } } } } - if (!encoder_exit_) { - int loopfilter_level, qp; + if (!superframe_is_dropped) { + int loopfilter_level; + std::vector<int> encoder_qp(VPX_SS_MAX_LAYERS, 0); encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); - encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); - ASSERT_EQ(rc_api_->GetQP(), qp); + encoder->Control(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, encoder_qp.data()); + encoder_qp.resize(rc_qp.size()); + ASSERT_EQ(rc_qp, encoder_qp); ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + current_superframe_++; } } // This method needs to be overridden because non-reference frames are // expected to be mismatched frames as the encoder will avoid loopfilter on // these frames. - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) {} + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override {} void RunSvc() { - dynamic_spatial_layers_ = 0; SetRCConfigSvc(3, 3); - key_interval_ = 10000; rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); SetEncoderConfigSvc(3, 3); @@ -279,8 +387,22 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunSvcDropFramesCBR() { + max_consec_drop_ = 10; + frame_drop_thresh_ = 30; + SetRCConfigSvc(3, 3); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + void RunSvcPeriodicKey() { - dynamic_spatial_layers_ = 0; SetRCConfigSvc(3, 3); key_interval_ = 100; rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); @@ -295,7 +417,19 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, void RunSvcDynamicSpatial() { dynamic_spatial_layers_ = 1; SetRCConfigSvc(3, 3); - key_interval_ = 10000; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcParallelSpatialLayers() { + if (!inter_layer_pred_off_) return; + parallel_spatial_layers_ = true; + SetRCConfigSvc(3, 3); rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); SetEncoderConfigSvc(3, 3); @@ -391,12 +525,14 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, cfg_.kf_max_dist = 9999; cfg_.rc_overshoot_pct = 50; cfg_.rc_undershoot_pct = 50; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; cfg_.rc_target_bitrate = 0; for (int sl = 0; sl < number_spatial_layers; sl++) { int spatial_bitrate = 0; if (number_spatial_layers <= 3) - spatial_bitrate = kSpatialLayerBitrate[sl]; + spatial_bitrate = frame_drop_thresh_ > 0 ? 
kSpatialLayerBitrateLow[sl] + : kSpatialLayerBitrate[sl]; for (int tl = 0; tl < number_temporal_layers; tl++) { int layer = sl * number_temporal_layers + tl; if (number_temporal_layers == 3) @@ -431,6 +567,8 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, rc_cfg_.framerate = 30.0; rc_cfg_.rc_mode = VPX_CBR; rc_cfg_.aq_mode = aq_mode_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; + rc_cfg_.max_consec_drop = max_consec_drop_; if (number_spatial_layers == 3) { rc_cfg_.scaling_factor_num[0] = 1; @@ -464,7 +602,8 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, for (int sl = 0; sl < number_spatial_layers; sl++) { int spatial_bitrate = 0; if (number_spatial_layers <= 3) - spatial_bitrate = kSpatialLayerBitrate[sl]; + spatial_bitrate = frame_drop_thresh_ > 0 ? kSpatialLayerBitrateLow[sl] + : kSpatialLayerBitrate[sl]; for (int tl = 0; tl < number_temporal_layers; tl++) { int layer = sl * number_temporal_layers + tl; if (number_temporal_layers == 3) @@ -498,19 +637,36 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, uint32_t sizes_[8]; int key_interval_; int dynamic_spatial_layers_; + bool inter_layer_pred_off_; + // ComputeQP() and PostEncodeUpdate() don't need to be sequential for KSVC. + bool parallel_spatial_layers_; + int frame_drop_thresh_; + int max_consec_drop_; + int num_drops_; }; TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } +TEST_P(RcInterfaceTest, OneLayerDropFramesCBR) { RunOneLayerDropFramesCBR(); } + +TEST_P(RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); } + TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } +TEST_P(RcInterfaceSvcTest, SvcDropFramesCBR) { RunSvcDropFramesCBR(); } + +TEST_P(RcInterfaceSvcTest, SvcParallelSpatialLayers) { + RunSvcParallelSpatialLayers(); +} + TEST_P(RcInterfaceSvcTest, SvcPeriodicKey) { RunSvcPeriodicKey(); } TEST_P(RcInterfaceSvcTest, SvcDynamicSpatial) { RunSvcDynamicSpatial(); } VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), ::testing::Values(VPX_CBR, VPX_VBR)); -VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3)); +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3), + ::testing::Values(true, false)); } // namespace diff --git a/test/vp9_roi_test.cc b/test/vp9_roi_test.cc index e8373c4c0..a9347fb36 100644 --- a/test/vp9_roi_test.cc +++ b/test/vp9_roi_test.cc @@ -84,9 +84,9 @@ class RoiMaskBackgroundSkip : public ::libvpx_test::EncoderTest, public ::testing::Test { protected: RoiMaskBackgroundSkip() : EncoderTest(&::libvpx_test::kVP9) {} - virtual ~RoiMaskBackgroundSkip() { free(roi_.roi_map); } + ~RoiMaskBackgroundSkip() override { free(roi_.roi_map); } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); SetRoi(); @@ -114,8 +114,8 @@ class RoiMaskBackgroundSkip : public ::libvpx_test::EncoderTest, } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, 3); diff --git a/test/vp9_scale_test.cc b/test/vp9_scale_test.cc index 2d1203fb8..049a10a61 100644 --- a/test/vp9_scale_test.cc +++ b/test/vp9_scale_test.cc @@ -33,10 +33,10 @@ typedef void (*ScaleFrameFunc)(const YV12_BUFFER_CONFIG *src, class ScaleTest : public VpxScaleBase, public 
::testing::TestWithParam<ScaleFrameFunc> { public: - virtual ~ScaleTest() {} + ~ScaleTest() override = default; protected: - virtual void SetUp() { scale_fn_ = GetParam(); } + void SetUp() override { scale_fn_ = GetParam(); } void ReferenceScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { vp9_scale_and_extend_frame_c(&img_, &ref_img_, filter_type, phase_scaler); diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index a57082f1e..78deb5190 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -34,10 +34,10 @@ namespace vp9 { class VP9SubtractBlockTest : public AbstractBench, public ::testing::TestWithParam<SubtractFunc> { public: - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - virtual void Run() { + void Run() override { GetParam()(block_height_, block_width_, diff_, block_width_, src_, block_width_, pred_, block_width_); } @@ -176,7 +176,7 @@ using Params = std::tuple<BLOCK_SIZE, int, HBDSubtractFunc, HBDSubtractFunc>; class VPXHBDSubtractBlockTest : public ::testing::TestWithParam<Params> { public: - virtual void SetUp() { + void SetUp() override { block_width_ = 4 * num_4x4_blocks_wide_lookup[GET_PARAM(0)]; block_height_ = 4 * num_4x4_blocks_high_lookup[GET_PARAM(0)]; bit_depth_ = static_cast<vpx_bit_depth_t>(GET_PARAM(1)); @@ -198,7 +198,7 @@ class VPXHBDSubtractBlockTest : public ::testing::TestWithParam<Params> { ASSERT_NE(diff_, nullptr); } - virtual void TearDown() { + void TearDown() override { vpx_free(CONVERT_TO_SHORTPTR(src_)); vpx_free(CONVERT_TO_SHORTPTR(pred_)); vpx_free(diff_); diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 1ceef8185..c0cea681d 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -26,10 +26,10 @@ using std::string; class VPxWorkerThreadTest : public ::testing::TestWithParam<bool> { protected: - virtual ~VPxWorkerThreadTest() {} - virtual void SetUp() { vpx_get_worker_interface()->init(&worker_); } + ~VPxWorkerThreadTest() override = default; + void SetUp() override { vpx_get_worker_interface()->init(&worker_); } - virtual void TearDown() { vpx_get_worker_interface()->end(&worker_); } + void TearDown() override { vpx_get_worker_interface()->end(&worker_); } void Run(VPxWorker *worker) { const bool synchronous = GetParam(); diff --git a/test/vpx_scale_test.cc b/test/vpx_scale_test.cc index 7eea437fc..3897a6088 100644 --- a/test/vpx_scale_test.cc +++ b/test/vpx_scale_test.cc @@ -38,10 +38,10 @@ class ExtendBorderTest : public VpxScaleBase, public ::testing::TestWithParam<ExtendFrameBorderFunc> { public: - virtual ~ExtendBorderTest() {} + ~ExtendBorderTest() override = default; protected: - virtual void SetUp() { extend_fn_ = GetParam(); } + void SetUp() override { extend_fn_ = GetParam(); } void ExtendBorder() { ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); } @@ -68,10 +68,10 @@ INSTANTIATE_TEST_SUITE_P(C, ExtendBorderTest, class CopyFrameTest : public VpxScaleBase, public ::testing::TestWithParam<CopyFrameFunc> { public: - virtual ~CopyFrameTest() {} + ~CopyFrameTest() override = default; protected: - virtual void SetUp() { copy_frame_fn_ = GetParam(); } + void SetUp() override { copy_frame_fn_ = GetParam(); } void CopyFrame() { ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &dst_img_)); diff --git a/test/webm_video_source.h b/test/webm_video_source.h index d24592629..6ab50c849 100644 --- a/test/webm_video_source.h +++ b/test/webm_video_source.h @@ -29,16 +29,16 @@ class WebMVideoSource : 
public CompressedVideoSource { webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0), frame_(0), end_of_file_(false) {} - virtual ~WebMVideoSource() { + ~WebMVideoSource() override { if (vpx_ctx_->file != nullptr) fclose(vpx_ctx_->file); webm_free(webm_ctx_); delete vpx_ctx_; delete webm_ctx_; } - virtual void Init() {} + void Init() override {} - virtual void Begin() { + void Begin() override { vpx_ctx_->file = OpenTestDataFile(file_name_); ASSERT_NE(vpx_ctx_->file, nullptr) << "Input file open failed. Filename: " << file_name_; @@ -48,7 +48,7 @@ class WebMVideoSource : public CompressedVideoSource { FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } @@ -74,11 +74,11 @@ class WebMVideoSource : public CompressedVideoSource { } while (!webm_ctx_->is_key_frame && !end_of_file_); } - virtual const uint8_t *cxdata() const { + const uint8_t *cxdata() const override { return end_of_file_ ? nullptr : buf_; } - virtual size_t frame_size() const { return buf_sz_; } - virtual unsigned int frame_number() const { return frame_; } + size_t frame_size() const override { return buf_sz_; } + unsigned int frame_number() const override { return frame_; } protected: std::string file_name_; diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 32f2cd51d..78a944fd0 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -78,7 +78,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>, protected: Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {} - virtual ~Y4mVideoSourceTest() { CloseSource(); } + ~Y4mVideoSourceTest() override { CloseSource(); } virtual void Init(const std::string &file_name, int limit) { file_name_ = file_name; @@ -140,7 +140,7 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest { protected: Y4mVideoWriteTest() : tmpfile_(nullptr) {} - virtual ~Y4mVideoWriteTest() { + ~Y4mVideoWriteTest() override { delete tmpfile_; input_file_ = nullptr; } @@ -172,7 +172,7 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest { ReplaceInputFile(tmpfile_->file()); } - virtual void Init(const std::string &file_name, int limit) { + void Init(const std::string &file_name, int limit) override { Y4mVideoSourceTest::Init(file_name, limit); WriteY4mAndReadBack(); } diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h index 71fbf3193..e43e37d9e 100644 --- a/test/y4m_video_source.h +++ b/test/y4m_video_source.h @@ -27,7 +27,7 @@ class Y4mVideoSource : public VideoSource { start_(start), limit_(limit), frame_(0), framerate_numerator_(0), framerate_denominator_(0), y4m_() {} - virtual ~Y4mVideoSource() { + ~Y4mVideoSource() override { vpx_img_free(img_.get()); CloseSource(); } @@ -51,33 +51,33 @@ class Y4mVideoSource : public VideoSource { FillFrame(); } - virtual void Begin() { + void Begin() override { OpenSource(); ReadSourceToStart(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } - virtual vpx_image_t *img() const { + vpx_image_t *img() const override { return (frame_ < limit_) ? img_.get() : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. 
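The pts() comment above is worth spelling out: the test sources use Timebase = 1/FPS, so pts equals the frame index, and presentation time in seconds is pts multiplied by the timebase. A standalone example for a 30 fps source:

    #include <cstdio>

    int main() {
      const int timebase_num = 1, timebase_den = 30;  // timebase = 1/FPS
      const long pts = 45;                            // == frame index here
      const double seconds =
          pts * static_cast<double>(timebase_num) / timebase_den;
      std::printf("frame %ld plays at %.2f s\n", pts, seconds);  // 1.50 s
      return 0;
    }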
- virtual vpx_codec_pts_t pts() const { return frame_; } + vpx_codec_pts_t pts() const override { return frame_; } - virtual unsigned long duration() const { return 1; } + unsigned long duration() const override { return 1; } - virtual vpx_rational_t timebase() const { + vpx_rational_t timebase() const override { const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ }; return t; } - virtual unsigned int frame() const { return frame_; } + unsigned int frame() const override { return frame_; } - virtual unsigned int limit() const { return limit_; } + unsigned int limit() const override { return limit_; } virtual void FillFrame() { ASSERT_NE(input_file_, nullptr); diff --git a/test/yuv_temporal_filter_test.cc b/test/yuv_temporal_filter_test.cc index 2bdcf4d86..0677d5568 100644 --- a/test/yuv_temporal_filter_test.cc +++ b/test/yuv_temporal_filter_test.cc @@ -290,7 +290,7 @@ void ApplyReferenceFilter( class YUVTemporalFilterTest : public ::testing::TestWithParam<TemporalFilterWithBd> { public: - virtual void SetUp() { + void SetUp() override { filter_func_ = GetParam().temporal_filter; bd_ = GetParam().bd; use_highbd_ = (bd_ != 8); @@ -694,6 +694,18 @@ INSTANTIATE_TEST_SUITE_P( TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_12, 12))); #endif // HAVE_SSE4_1 +#if HAVE_NEON +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 10) +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 12) + +INSTANTIATE_TEST_SUITE_P( + NEON, YUVTemporalFilterTest, + ::testing::Values( + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_10, + 10), + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_12, + 12))); +#endif // HAVE_NEON #else INSTANTIATE_TEST_SUITE_P( C, YUVTemporalFilterTest, @@ -704,5 +716,11 @@ INSTANTIATE_TEST_SUITE_P(SSE4_1, YUVTemporalFilterTest, ::testing::Values(TemporalFilterWithBd( &vp9_apply_temporal_filter_sse4_1, 8))); #endif // HAVE_SSE4_1 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, YUVTemporalFilterTest, + ::testing::Values(TemporalFilterWithBd( + &vp9_apply_temporal_filter_neon, 8))); +#endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH + } // namespace diff --git a/test/yuv_video_source.h b/test/yuv_video_source.h index 51948c0ef..bb5eec5bb 100644 --- a/test/yuv_video_source.h +++ b/test/yuv_video_source.h @@ -35,12 +35,12 @@ class YUVVideoSource : public VideoSource { SetSize(width, height, format); } - virtual ~YUVVideoSource() { + ~YUVVideoSource() override { vpx_img_free(img_); if (input_file_) fclose(input_file_); } - virtual void Begin() { + void Begin() override { if (input_file_) fclose(input_file_); input_file_ = OpenTestDataFile(file_name_); ASSERT_NE(input_file_, nullptr) @@ -53,28 +53,28 @@ class YUVVideoSource : public VideoSource { FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } - virtual vpx_image_t *img() const { + vpx_image_t *img() const override { return (frame_ < limit_) ? img_ : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. 
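The yuv_temporal_filter hunks above follow libvpx's usual recipe for bringing a new SIMD flavor under an existing parameterized test: wrap the bit-depth variants with WRAP_HIGHBD_FUNC, then add one guarded instantiation per ISA. The same skeleton would apply to any future extension (a sketch; HAVE_FOO and vp9_apply_temporal_filter_foo are hypothetical names by analogy with the HAVE_NEON block):

    #if HAVE_FOO  // hypothetical ISA guard, analogous to HAVE_NEON above
    INSTANTIATE_TEST_SUITE_P(FOO, YUVTemporalFilterTest,
                             ::testing::Values(TemporalFilterWithBd(
                                 &vp9_apply_temporal_filter_foo, 8)));
    #endif  // HAVE_FOO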
- virtual vpx_codec_pts_t pts() const { return frame_; } + vpx_codec_pts_t pts() const override { return frame_; } - virtual unsigned long duration() const { return 1; } + unsigned long duration() const override { return 1; } - virtual vpx_rational_t timebase() const { + vpx_rational_t timebase() const override { const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ }; return t; } - virtual unsigned int frame() const { return frame_; } + unsigned int frame() const override { return frame_; } - virtual unsigned int limit() const { return limit_; } + unsigned int limit() const override { return limit_; } virtual void SetSize(unsigned int width, unsigned int height, vpx_img_fmt format) { diff --git a/third_party/libwebm/AUTHORS.TXT b/third_party/libwebm/AUTHORS.TXT index 9686ac13e..59b648ca6 100644 --- a/third_party/libwebm/AUTHORS.TXT +++ b/third_party/libwebm/AUTHORS.TXT @@ -2,3 +2,4 @@ # Name or Organization <email address> Google Inc. +Elijah Cirioli <eli.cirioli@gmail.com> diff --git a/third_party/libwebm/Android.mk b/third_party/libwebm/Android.mk index 23f935f2d..b02795cca 100644 --- a/third_party/libwebm/Android.mk +++ b/third_party/libwebm/Android.mk @@ -1,3 +1,5 @@ +# Ignore this file during non-NDK builds. +ifdef NDK_ROOT LOCAL_PATH:= $(call my-dir) include $(CLEAR_VARS) @@ -18,3 +20,4 @@ LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD LOCAL_LICENSE_CONDITIONS := notice LOCAL_NOTICE_FILE := $(LOCAL_PATH)/LICENSE.TXT $(LOCAL_PATH)/PATENTS.TXT include $(BUILD_STATIC_LIBRARY) +endif # NDK_ROOT diff --git a/third_party/libwebm/README.libvpx b/third_party/libwebm/README.libvpx index 325604cc6..a79b982ef 100644 --- a/third_party/libwebm/README.libvpx +++ b/third_party/libwebm/README.libvpx @@ -1,7 +1,7 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: ee0bab576c338c9807249b99588e352b7268cb62 +Version: 1930e3ca23b007f3ff11d98a570077be6201957e License: BSD -License File: LICENSE.txt +License File: LICENSE.TXT Description: libwebm is used to handle WebM container I/O. @@ -18,3 +18,4 @@ Only keep: - mkvmuxer/ - mkvparser/ - PATENTS.TXT +- use -std=gnu++11 in Android.mk (https://crbug.com/webm/1708) diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/third_party/libwebm/mkvmuxer/mkvmuxer.cc index ae3653143..faaf0165f 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -607,10 +607,10 @@ bool ContentEncoding::Write(IMkvWriter* writer) const { return true; } -uint64_t ContentEncoding::EncodingSize(uint64_t compresion_size, +uint64_t ContentEncoding::EncodingSize(uint64_t compression_size, uint64_t encryption_size) const { // TODO(fgalligan): Add support for compression settings. - if (compresion_size != 0) + if (compression_size != 0) return 0; uint64_t encoding_size = 0; diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.h b/third_party/libwebm/mkvmuxer/mkvmuxer.h index f2db37714..8602d8232 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxer.h +++ b/third_party/libwebm/mkvmuxer/mkvmuxer.h @@ -330,7 +330,7 @@ class ContentEncoding { private: // Returns the size in bytes for the encoding elements. - uint64_t EncodingSize(uint64_t compresion_size, + uint64_t EncodingSize(uint64_t compression_size, uint64_t encryption_size) const; // Returns the size in bytes for the encryption elements. @@ -1425,7 +1425,7 @@ class SeekHead { bool Write(IMkvWriter* writer); // We are going to put a cap on the number of Seek Entries. 
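The libwebm hunk that follows switches integral class constants from `static const` to `constexpr`. For integral types the two behave almost identically, but constexpr guarantees a compile-time constant and, from C++17 on, a constexpr static data member is implicitly inline, so no out-of-class definition is needed even when the member is odr-used. A minimal standalone illustration:

    struct Limits {
      // Compile-time constant; implicitly inline since C++17, so no
      // separate out-of-class definition is required.
      static constexpr int kSeekEntryCount = 5;
    };

    int table[Limits::kSeekEntryCount];  // usable wherever a constant is required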
- const static int32_t kSeekEntryCount = 5; + constexpr static int32_t kSeekEntryCount = 5; private: // Returns the maximum size in bytes of one seek entry. @@ -1505,8 +1505,8 @@ class Segment { kBeforeClusters = 0x1 // Position Cues before Clusters }; - static const uint32_t kDefaultDocTypeVersion = 4; - static const uint64_t kDefaultMaxClusterDuration = 30000000000ULL; + static constexpr uint32_t kDefaultDocTypeVersion = 4; + static constexpr uint64_t kDefaultMaxClusterDuration = 30000000000ULL; Segment(); ~Segment(); diff --git a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index bd2f76913..300b15579 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc @@ -607,7 +607,7 @@ uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) { void GetVersion(int32* major, int32* minor, int32* build, int32* revision) { *major = 0; *minor = 3; - *build = 0; + *build = 1; *revision = 0; } diff --git a/third_party/libwebm/mkvparser/mkvparser.cc b/third_party/libwebm/mkvparser/mkvparser.cc index de8884b38..868afcb3e 100644 --- a/third_party/libwebm/mkvparser/mkvparser.cc +++ b/third_party/libwebm/mkvparser/mkvparser.cc @@ -55,7 +55,7 @@ Type* SafeArrayAlloc(unsigned long long num_elements, void GetVersion(int& major, int& minor, int& build, int& revision) { major = 1; minor = 1; - build = 0; + build = 1; revision = 0; } @@ -298,7 +298,7 @@ long UnserializeInt(IMkvReader* pReader, long long pos, long long size, if (status < 0) return status; - unsigned long long result = first_byte; + unsigned long long result = static_cast<unsigned long long>(first_byte); ++pos; for (long i = 1; i < size; ++i) { @@ -2432,7 +2432,7 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_, pos += size; // consume payload } - if ((m_pos < 0) || (m_track <= 0)) { + if ((m_pos < 0) || (m_track <= 0) || (m_block < 0) || (m_block > LONG_MAX)) { return false; } diff --git a/tools_common.c b/tools_common.c index cbecfbb41..5c1378151 100644 --- a/tools_common.c +++ b/tools_common.c @@ -24,6 +24,8 @@ #include "vpx/vp8dx.h" #endif +#include "vpx/vpx_codec.h" + #if defined(_WIN32) || defined(__OS2__) #include <io.h> #include <fcntl.h> @@ -77,8 +79,8 @@ void warn(const char *fmt, ...) 
{ LOG_ERROR("Warning"); } void die_codec(vpx_codec_ctx_t *ctx, const char *s) { const char *detail = vpx_codec_error_detail(ctx); - printf("%s: %s\n", s, vpx_codec_error(ctx)); - if (detail) printf(" %s\n", detail); + fprintf(stderr, "%s: %s\n", s, vpx_codec_error(ctx)); + if (detail) fprintf(stderr, " %s\n", detail); exit(EXIT_FAILURE); } @@ -375,7 +377,7 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I42216: case VPX_IMG_FMT_I44416: case VPX_IMG_FMT_I44016: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -411,7 +413,7 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -452,7 +454,7 @@ void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) { case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -487,7 +489,7 @@ static void highbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I42216: case VPX_IMG_FMT_I44416: case VPX_IMG_FMT_I44016: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -521,7 +523,7 @@ static void lowbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; diff --git a/tools_common.h b/tools_common.h index b9cfb9cc8..e2942d04b 100644 --- a/tools_common.h +++ b/tools_common.h @@ -32,7 +32,12 @@ typedef int64_t FileOffset; #define fseeko fseeko64 #define ftello ftello64 typedef off64_t FileOffset; -#elif CONFIG_OS_SUPPORT +#elif CONFIG_OS_SUPPORT && \ + !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \ + defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64) +/* POSIX.1 has fseeko and ftello. fseeko and ftello are not available before + * Android API level 24. 
See + * https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md */ #include <sys/types.h> /* NOLINT */ typedef off_t FileOffset; /* Use 32-bit file operations in WebM file format when building ARM @@ -145,9 +150,9 @@ VPX_NO_RETURN void usage_exit(void); int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame); typedef struct VpxInterface { - const char *const name; - const uint32_t fourcc; - vpx_codec_iface_t *(*const codec_interface)(); + const char *name; + uint32_t fourcc; + vpx_codec_iface_t *(*codec_interface)(void); } VpxInterface; int get_vpx_encoder_count(void); diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c index 48e86d327..ee3c281f0 100644 --- a/vp8/common/arm/neon/sixtappredict_neon.c +++ b/vp8/common/arm/neon/sixtappredict_neon.c @@ -16,7 +16,7 @@ #include "vpx_ports/mem.h" static const int8_t vp8_sub_pel_filters[8][8] = { - { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positionyys are */ + { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */ { 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */ { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */ { 0, -9, 93, 50, -6, 0, 0, 0 }, @@ -781,7 +781,6 @@ void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, vst1_u8(dst_ptr, d8u8); dst_ptr += dst_pitch; vst1_u8(dst_ptr, d9u8); - return; } void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, @@ -1250,7 +1249,6 @@ void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, vst1_u8(dst_ptr, d9u8); dst_ptr += dst_pitch; } - return; } void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, @@ -1504,7 +1502,9 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, src += src_pixels_per_line; d12u8 = vld1_u8(src); d13u8 = vld1_u8(src + 8); - d14u8 = vld1_u8(src + 16); + // Only 5 pixels are needed, avoid a potential out of bounds read. + d14u8 = vld1_u8(src + 13); + d14u8 = vext_u8(d14u8, d14u8, 3); src += src_pixels_per_line; __builtin_prefetch(src); @@ -1726,5 +1726,4 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, dst += dst_pitch; } } - return; } diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index 405443449..8300aad94 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -251,7 +251,7 @@ typedef struct macroblockd { unsigned char update_mb_segmentation_data; /* 0 (do not update) 1 (update) the macroblock segmentation feature data. 
*/ - unsigned char mb_segement_abs_delta; + unsigned char mb_segment_abs_delta; /* Per frame flags that define which MB level features (such as quantizer or * loop filter level) */ diff --git a/vp8/common/loongarch/sixtap_filter_lsx.c b/vp8/common/loongarch/sixtap_filter_lsx.c index cd7ba5474..986763341 100644 --- a/vp8/common/loongarch/sixtap_filter_lsx.c +++ b/vp8/common/loongarch/sixtap_filter_lsx.c @@ -1706,21 +1706,22 @@ void vp8_sixtap_predict4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride, switch (xoffset) { case 0: { __m128i tp0; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); - src += src_stride; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); - src += src_stride; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); - src += src_stride; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; __lsx_vstelm_w(tp0, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_w(tp0, dst, 0, 1); + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; + __lsx_vstelm_w(tp0, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_w(tp0, dst, 0, 2); + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; + __lsx_vstelm_w(tp0, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_w(tp0, dst, 0, 3); + tp0 = __lsx_vldrepl_w(src, 0); + __lsx_vstelm_w(tp0, dst, 0, 0); + break; } case 2: @@ -1865,7 +1866,7 @@ void vp8_sixtap_predict16x16_lsx(uint8_t *RESTRICT src, int32_t src_stride, case 1: Predict16x16Funcs1[3](src, src_stride, dst, dst_stride, - h_filter, v_filter + 1, 16); + h_filter + 1, v_filter + 1, 16); break; } break; diff --git a/vp8/common/mips/msa/sixtap_filter_msa.c b/vp8/common/mips/msa/sixtap_filter_msa.c index b0affcff0..3a1bb7cd5 100644 --- a/vp8/common/mips/msa/sixtap_filter_msa.c +++ b/vp8/common/mips/msa/sixtap_filter_msa.c @@ -35,101 +35,134 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { #define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \ filt_h2) \ ({ \ - v16i8 vec0_m, vec1_m, vec2_m; \ - v8i16 hz_out_m; \ + v16i8 _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m; \ + v8i16 _6tap_out_m; \ \ VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \ - vec0_m, vec1_m, vec2_m); \ - hz_out_m = \ - DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); \ + _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m); \ + _6tap_out_m = DPADD_SH3_SH(_6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m, \ + filt_h0, filt_h1, filt_h2); \ \ - hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + _6tap_out_m = __msa_srari_h(_6tap_out_m, VP8_FILTER_SHIFT); \ + _6tap_out_m = __msa_sat_s_h(_6tap_out_m, 7); \ \ - hz_out_m; \ + _6tap_out_m; \ }) #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ mask2, filt0, filt1, filt2, out0, out1) \ { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ + v16i8 _6tap_4wid_vec0_m, _6tap_4wid_vec1_m, _6tap_4wid_vec2_m, \ + _6tap_4wid_vec3_m, _6tap_4wid_vec4_m, _6tap_4wid_vec5_m; \ \ - VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ - DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ - VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ - VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ - DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _6tap_4wid_vec0_m, \ + _6tap_4wid_vec1_m); \ + DOTP_SB2_SH(_6tap_4wid_vec0_m, _6tap_4wid_vec1_m, filt0, filt0, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, 
mask1, _6tap_4wid_vec2_m, \ + _6tap_4wid_vec3_m); \ + DPADD_SB2_SH(_6tap_4wid_vec2_m, _6tap_4wid_vec3_m, filt1, filt1, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, _6tap_4wid_vec4_m, \ + _6tap_4wid_vec5_m); \ + DPADD_SB2_SH(_6tap_4wid_vec4_m, _6tap_4wid_vec5_m, filt2, filt2, out0, \ + out1); \ } -#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - mask2, filt0, filt1, filt2, out0, out1, \ - out2, out3) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - \ - VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ - out0, out1, out2, out3); \ - VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m); \ - DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ - out0, out1, out2, out3); \ - DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \ - out0, out1, out2, out3); \ +#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, filt0, filt1, filt2, out0, out1, \ + out2, out3) \ + { \ + v16i8 _6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, _6tap_8wid_vec4_m, _6tap_8wid_vec5_m, \ + _6tap_8wid_vec6_m, _6tap_8wid_vec7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _6tap_8wid_vec0_m, \ + _6tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m); \ + DOTP_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1, \ + out2, out3); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _6tap_8wid_vec0_m, \ + _6tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, _6tap_8wid_vec4_m, \ + _6tap_8wid_vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, _6tap_8wid_vec6_m, \ + _6tap_8wid_vec7_m); \ + DPADD_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \ + out2, out3); \ + DPADD_SB4_SH(_6tap_8wid_vec4_m, _6tap_8wid_vec5_m, _6tap_8wid_vec6_m, \ + _6tap_8wid_vec7_m, filt2, filt2, filt2, filt2, out0, out1, \ + out2, out3); \ } -#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ - ({ \ - v8i16 tmp0; \ - \ - tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ - tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ - \ - tmp0; \ - }) - -#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ +#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ ({ \ - v16i8 vec0_m, vec1_m; \ - v8i16 hz_out_m; \ - \ - VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m); \ - hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \ + v8i16 _4tap_dpadd_tmp0; \ \ - hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + _4tap_dpadd_tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + _4tap_dpadd_tmp0 = \ + __msa_dpadd_s_h(_4tap_dpadd_tmp0, (v16i8)vec1, (v16i8)filt1); \ \ - hz_out_m; \ + _4tap_dpadd_tmp0; \ }) -#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, 
src2, src3, mask0, mask1, \ - filt0, filt1, out0, out1) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ - \ - VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ - DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ - VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ +#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ + ({ \ + v16i8 _4tap_vec0_m, _4tap_vec1_m; \ + v8i16 _4tap_out_m; \ + \ + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, _4tap_vec0_m, \ + _4tap_vec1_m); \ + _4tap_out_m = \ + FILT_4TAP_DPADD_S_H(_4tap_vec0_m, _4tap_vec1_m, filt_h0, filt_h1); \ + \ + _4tap_out_m = __msa_srari_h(_4tap_out_m, VP8_FILTER_SHIFT); \ + _4tap_out_m = __msa_sat_s_h(_4tap_out_m, 7); \ + \ + _4tap_out_m; \ + }) + +#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1) \ + { \ + v16i8 _4tap_4wid_vec0_m, _4tap_4wid_vec1_m, _4tap_4wid_vec2_m, \ + _4tap_4wid_vec3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _4tap_4wid_vec0_m, \ + _4tap_4wid_vec1_m); \ + DOTP_SB2_SH(_4tap_4wid_vec0_m, _4tap_4wid_vec1_m, filt0, filt0, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _4tap_4wid_vec2_m, \ + _4tap_4wid_vec3_m); \ + DPADD_SB2_SH(_4tap_4wid_vec2_m, _4tap_4wid_vec3_m, filt1, filt1, out0, \ + out1); \ } -#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - filt0, filt1, out0, out1, out2, out3) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ - \ - VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ - out0, out1, out2, out3); \ - VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ - out0, out1, out2, out3); \ +#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1, out2, out3) \ + { \ + v16i8 _4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _4tap_8wid_vec0_m, \ + _4tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m); \ + DOTP_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1, \ + out2, out3); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _4tap_8wid_vec0_m, \ + _4tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m); \ + DPADD_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \ + out2, out3); \ } static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h index 7cb3c9869..cc85b9a1f 100644 --- a/vp8/common/mips/msa/vp8_macros_msa.h +++ b/vp8/common/mips/msa/vp8_macros_msa.h @@ -40,160 +40,160 @@ #define ST_SW(...) 
ST_W(v4i32, __VA_ARGS__) #if (__mips_isa_rev >= 6) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - asm volatile("lw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \ + uint32_t lw_val_m; \ + \ + asm volatile("lw %[lw_val_m], %[lw_psrc_m] \n\t" \ + \ + : [lw_val_m] "=r"(lw_val_m) \ + : [lw_psrc_m] "m"(*lw_psrc_m)); \ + \ + lw_val_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - asm volatile("ld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint64_t ld_val_m = 0; \ + \ + asm volatile("ld %[ld_val_m], %[ld_psrc_m] \n\t" \ + \ + : [ld_val_m] "=r"(ld_val_m) \ + : [ld_psrc_m] "m"(*ld_psrc_m)); \ + \ + ld_val_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_ld = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_ld); \ - val1_m = LW(psrc_ld + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint32_t ld_val0_m, ld_val1_m; \ + uint64_t ld_val_m = 0; \ + \ + ld_val0_m = LW(ld_psrc_m); \ + ld_val1_m = LW(ld_psrc_m + 4); \ + \ + ld_val_m = (uint64_t)(ld_val1_m); \ + ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \ + ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m); \ + \ + ld_val_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - asm volatile("sh %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SH(val, pdst) \ + { \ + uint8_t *sh_pdst_m = (uint8_t *)(pdst); \ + const uint16_t sh_val_m = (val); \ + \ + asm volatile("sh %[sh_val_m], %[sh_pdst_m] \n\t" \ + \ + : [sh_pdst_m] "=m"(*sh_pdst_m) \ + : [sh_val_m] "r"(sh_val_m)); \ } -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - asm volatile("sw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SW(val, pdst) \ + { \ + uint8_t *sw_pdst_m = (uint8_t *)(pdst); \ + const uint32_t sw_val_m = (val); \ + \ + asm volatile("sw %[sw_val_m], %[sw_pdst_m] \n\t" \ + \ + : [sw_pdst_m] "=m"(*sw_pdst_m) \ + : [sw_val_m] "r"(sw_val_m)); \ } -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - asm volatile("sd %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SD(val, pdst) \ + { \ + uint8_t *sd_pdst_m = (uint8_t *)(pdst); \ + const uint64_t sd_val_m = (val); \ + \ + asm volatile("sd %[sd_val_m], %[sd_pdst_m] \n\t" \ + \ + : [sd_pdst_m] "=m"(*sd_pdst_m) \ + : [sd_val_m] "r"(sd_val_m)); \ } #else // !(__mips_isa_rev >= 6) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - asm volatile( \ - "lwr %[val_m], 0(%[psrc_m]) \n\t" \ - "lwl %[val_m], 3(%[psrc_m]) \n\t" \ - : [val_m] "=&r"(val_m) \ - : [psrc_m] "r"(psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + 
({ \ + const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \ + uint32_t lw_val_m; \ + \ + asm volatile( \ + "lwr %[lw_val_m], 0(%[lw_psrc_m]) \n\t" \ + "lwl %[lw_val_m], 3(%[lw_psrc_m]) \n\t" \ + : [lw_val_m] "=&r"(lw_val_m) \ + : [lw_psrc_m] "r"(lw_psrc_m)); \ + \ + lw_val_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - asm volatile( \ - "ldr %[val_m], 0(%[psrc_m]) \n\t" \ - "ldl %[val_m], 7(%[psrc_m]) \n\t" \ - : [val_m] "=&r"(val_m) \ - : [psrc_m] "r"(psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint64_t ld_val_m = 0; \ + \ + asm volatile( \ + "ldr %[ld_val_m], 0(%[ld_psrc_m]) \n\t" \ + "ldl %[ld_val_m], 7(%[ld_psrc_m]) \n\t" \ + : [ld_val_m] "=&r"(ld_val_m) \ + : [ld_psrc_m] "r"(ld_psrc_m)); \ + \ + ld_val_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t ld_val0_m, ld_val1_m; \ + uint64_t ld_val_m = 0; \ + \ + ld_val0_m = LW(ld_psrc_m1); \ + ld_val1_m = LW(ld_psrc_m1 + 4); \ + \ + ld_val_m = (uint64_t)(ld_val1_m); \ + ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \ + ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m); \ + \ + ld_val_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - asm volatile("ush %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SH(val, pdst) \ + { \ + uint8_t *sh_pdst_m = (uint8_t *)(pdst); \ + const uint16_t sh_val_m = (val); \ + \ + asm volatile("ush %[sh_val_m], %[sh_pdst_m] \n\t" \ + \ + : [sh_pdst_m] "=m"(*sh_pdst_m) \ + : [sh_val_m] "r"(sh_val_m)); \ } -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - asm volatile("usw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SW(val, pdst) \ + { \ + uint8_t *sw_pdst_m = (uint8_t *)(pdst); \ + const uint32_t sw_val_m = (val); \ + \ + asm volatile("usw %[sw_val_m], %[sw_pdst_m] \n\t" \ + \ + : [sw_pdst_m] "=m"(*sw_pdst_m) \ + : [sw_val_m] "r"(sw_val_m)); \ } -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m1 = (uint8_t *)(pdst); \ - uint32_t val0_m, val1_m; \ - \ - val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ - \ - SW(val0_m, pdst_m1); \ - SW(val1_m, pdst_m1 + 4); \ +#define SD(val, pdst) \ + { \ + uint8_t *sd_pdst_m1 = (uint8_t *)(pdst); \ + uint32_t sd_val0_m, sd_val1_m; \ + \ + sd_val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + sd_val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(sd_val0_m, sd_pdst_m1); \ + SW(sd_val1_m, sd_pdst_m1 + 4); \ } #endif // (__mips_isa_rev >= 6) diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 05c72df3f..1b70ea5db 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -27,13 +27,6 @@ struct VP8_COMP; /* Create/destroy static data structures. 
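The renames running through the two MSA files above are macro-hygiene fixes rather than behavior changes: these helpers are GNU statement-expression and block macros that expand inside one another (LD is built from two LW calls; HORIZ_4TAP_FILT expands FILT_4TAP_DPADD_S_H), so a temporary named plain val_m or vec0_m in every macro can shadow an identically named variable at the call site. Prefixing each temporary with its owning macro (lw_val_m, _4tap_vec0_m, ...) makes the names unique. A minimal sketch of the hazard this avoids, using a hypothetical macro rather than one from the tree (statement expressions are a GNU extension the tree already relies on):

#include <stdio.h>

/* Risky: the macro-local val_m shadows any caller variable of the same
 * name. DOUBLE_RISKY(val_m) would expand to int val_m = (val_m) * 2;
 * whose initializer reads the freshly declared, indeterminate inner
 * val_m, not the caller's value. */
#define DOUBLE_RISKY(x)  \
  ({                     \
    int val_m = (x) * 2; \
    val_m;               \
  })

/* Safe: a unique, macro-prefixed local cannot collide. */
#define DOUBLE_SAFE(x)          \
  ({                            \
    int double_val_m = (x) * 2; \
    double_val_m;               \
  })

int main(void) {
  int val_m = 21;
  printf("%d\n", DOUBLE_SAFE(val_m)); /* prints 42 */
  return 0;
}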
*/ typedef enum { - NORMAL = 0, - FOURFIVE = 1, - THREEFIVE = 2, - ONETWO = 3 -} VPX_SCALING; - -typedef enum { USAGE_LOCAL_FILE_PLAYBACK = 0x0, USAGE_STREAM_FROM_SERVER = 0x1, USAGE_CONSTRAINED_QUALITY = 0x2, @@ -58,19 +51,19 @@ typedef enum { #include <assert.h> static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { switch (mode) { - case NORMAL: + case VP8E_NORMAL: *hr = 1; *hs = 1; break; - case FOURFIVE: + case VP8E_FOURFIVE: *hr = 4; *hs = 5; break; - case THREEFIVE: + case VP8E_THREEFIVE: *hr = 3; *hs = 5; break; - case ONETWO: + case VP8E_ONETWO: *hr = 1; *hs = 2; break; @@ -90,7 +83,14 @@ typedef struct { int Width; int Height; struct vpx_rational timebase; - unsigned int target_bandwidth; /* kilobits per second */ + /* In either kilobits per second or bits per second, depending on which + * copy of oxcf this is in. + * - ctx->oxcf.target_bandwidth is in kilobits per second. See + * set_vp8e_config(). + * - ctx->cpi->oxcf.target_bandwidth is in bits per second. See + * vp8_change_config(). + */ + unsigned int target_bandwidth; /* Parameter used for applying denoiser. * For temporal denoiser: noise_sensitivity = 0 means off, @@ -221,6 +221,7 @@ typedef struct { /* Temporal scaling parameters */ unsigned int number_of_layers; + /* kilobits per second */ unsigned int target_bitrate[VPX_TS_MAX_PERIODICITY]; unsigned int rate_decimator[VPX_TS_MAX_PERIODICITY]; unsigned int periodicity; @@ -243,11 +244,11 @@ typedef struct { void vp8_initialize(); -struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf); +struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf); void vp8_remove_compressor(struct VP8_COMP **comp); void vp8_init_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf); -void vp8_change_config(struct VP8_COMP *cpi, VP8_CONFIG *oxcf); +void vp8_change_config(struct VP8_COMP *cpi, const VP8_CONFIG *oxcf); int vp8_receive_raw_frame(struct VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, @@ -273,8 +274,8 @@ int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int threshold[4]); int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols); -int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode); +int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode); int vp8_get_quantizer(struct VP8_COMP *cpi); #ifdef __cplusplus diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 739a61284..12b474d93 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -127,12 +127,6 @@ specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi/; # if (vpx_config("CONFIG_POSTPROC") eq "yes") { - add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - - add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - - add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"; specialize qw/vp8_filter_by_weight16x16 sse2 msa/; diff --git a/vp8/common/vp8_loopfilter.c b/vp8/common/vp8_loopfilter.c index 9c9e5f351..4576c1853 100644 --- a/vp8/common/vp8_loopfilter.c +++
b/vp8/common/vp8_loopfilter.c @@ -111,7 +111,7 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd, /* Note the baseline filter values for each segment */ if (mbd->segmentation_enabled) { - if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) { + if (mbd->mb_segment_abs_delta == SEGMENT_ABSDATA) { lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; } else { /* Delta Value */ lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c index 1c1566766..af9a98c1d 100644 --- a/vp8/decoder/decodeframe.c +++ b/vp8/decoder/decodeframe.c @@ -63,7 +63,7 @@ void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) { /* Decide whether to use the default or alternate baseline Q value. */ if (xd->segmentation_enabled) { /* Abs Value */ - if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA) { + if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id]; /* Delta Value */ @@ -829,7 +829,7 @@ static void init_frame(VP8D_COMP *pbi) { /* reset the segment feature data to 0 with delta coding (Default state). */ memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); - xd->mb_segement_abs_delta = SEGMENT_DELTADATA; + xd->mb_segment_abs_delta = SEGMENT_DELTADATA; /* reset the mode ref deltasa for loop filter */ memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas)); @@ -995,7 +995,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc); if (xd->update_mb_segmentation_data) { - xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc); + xd->mb_segment_abs_delta = (unsigned char)vp8_read_bit(bc); memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); @@ -1167,14 +1167,6 @@ int vp8_decode_frame(VP8D_COMP *pbi) { if (pbi->ec_active && xd->corrupted) pc->refresh_last_frame = 1; #endif - if (0) { - FILE *z = fopen("decodestats.stt", "a"); - fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n", pc->current_video_frame, - pc->frame_type, pc->refresh_golden_frame, pc->refresh_alt_ref_frame, - pc->refresh_last_frame, pc->base_qindex); - fclose(z); - } - { pbi->independent_partitions = 1; diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index a6bedc4fa..56500a850 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -135,27 +135,6 @@ int vp8_decode_frame(VP8D_COMP *pbi); int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf); int vp8_remove_decoder_instances(struct frame_buffers *fb); -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(pbi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval " at %s:%d", __FILE__, \ - __LINE__); \ - } while (0) -#else -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(pbi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval); \ - } while (0) -#endif - #ifdef __cplusplus } // extern "C" #endif diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index 490f62d1b..6ccb080cf 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -30,11 +30,13 @@ #include "error_concealment.h" #endif -#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n))) -#define CALLOC_ARRAY_ALIGNED(p, n, algn) \ - do { \ - CHECK_MEM_ERROR((p), vpx_memalign((algn), 
sizeof(*(p)) * (n))); \ - memset((p), 0, (n) * sizeof(*(p))); \ +#define CALLOC_ARRAY(p, n) \ + CHECK_MEM_ERROR(&pbi->common.error, (p), vpx_calloc(sizeof(*(p)), (n))) +#define CALLOC_ARRAY_ALIGNED(p, n, algn) \ + do { \ + CHECK_MEM_ERROR(&pbi->common.error, (p), \ + vpx_memalign((algn), sizeof(*(p)) * (n))); \ + memset((p), 0, (n) * sizeof(*(p))); \ } while (0) static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, @@ -54,7 +56,7 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, mbd->dst = xd->dst; mbd->segmentation_enabled = xd->segmentation_enabled; - mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta; + mbd->mb_segment_abs_delta = xd->mb_segment_abs_delta; memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data)); @@ -754,7 +756,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { uv_width = width >> 1; /* Allocate a vpx_atomic_int for each mb row. */ - CHECK_MEM_ERROR(pbi->mt_current_mb_col, + CHECK_MEM_ERROR(&pc->error, pbi->mt_current_mb_col, vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows)); for (i = 0; i < pc->mb_rows; ++i) vpx_atomic_init(&pbi->mt_current_mb_col[i], 0); @@ -762,7 +764,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { /* Allocate memory for above_row buffers. */ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) { - CHECK_MEM_ERROR(pbi->mt_yabove_row[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_yabove_row[i], vpx_memalign(16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1)))); vp8_zero_array(pbi->mt_yabove_row[i], width + (VP8BORDERINPIXELS << 1)); @@ -770,7 +772,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) { - CHECK_MEM_ERROR(pbi->mt_uabove_row[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_uabove_row[i], vpx_memalign(16, sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); vp8_zero_array(pbi->mt_uabove_row[i], uv_width + VP8BORDERINPIXELS); @@ -778,7 +780,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) { - CHECK_MEM_ERROR(pbi->mt_vabove_row[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_vabove_row[i], vpx_memalign(16, sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); vp8_zero_array(pbi->mt_vabove_row[i], uv_width + VP8BORDERINPIXELS); @@ -787,17 +789,17 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { /* Allocate memory for left_col buffers. 
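The CHECK_MEM_ERROR rewrites here follow from the macro itself changing shape: the per-module copies (deleted from vp8/decoder/onyxd_int.h above, and from vp8/encoder/onyx_int.h later in this diff) hard-coded pbi->common.error or cpi->common.error, so the decoder and encoder each carried a near-identical private macro. Call sites now pass the error context explicitly, which is what allows one shared definition in vpx/internal/vpx_codec_internal.h (the header newly included by encodeframe.c below). A sketch of the shared release-mode shape, mirroring the deleted variants:

#define CHECK_MEM_ERROR(error, lval, expr)                   \
  do {                                                       \
    assert((error)->setjmp);                                 \
    (lval) = (expr);                                         \
    if (!(lval))                                             \
      vpx_internal_error((error), VPX_CODEC_MEM_ERROR,       \
                         "Failed to allocate " #lval);       \
  } while (0)

As in the deleted copies, a CONFIG_DEBUG variant additionally appends " at %s:%d" with __FILE__ and __LINE__.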
*/ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_yleft_col[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_yleft_col[i], vpx_calloc(sizeof(unsigned char) * 16, 1)); CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_uleft_col[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_uleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1)); CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_vleft_col[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1)); } } diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c index 6fc60805f..950c94334 100644 --- a/vp8/encoder/arm/neon/fastquantizeb_neon.c +++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c @@ -28,11 +28,11 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { zig_zag1 = vld1q_u16(inv_zig_zag + 8); int16x8_t x0, x1, sz0, sz1, y0, y1; uint16x8_t eob0, eob1; -#ifndef __aarch64__ +#if !VPX_ARCH_AARCH64 uint16x4_t eob_d16; uint32x2_t eob_d32; uint32x4_t eob_q32; -#endif // __arch64__ +#endif // !VPX_ARCH_AARCH64 /* sign of z: z >> 15 */ sz0 = vshrq_n_s16(z0, 15); @@ -70,7 +70,7 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { /* select the largest value */ eob0 = vmaxq_u16(eob0, eob1); -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *d->eob = (int8_t)vmaxvq_u16(eob0); #else eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0)); @@ -79,7 +79,7 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { eob_d32 = vpmax_u32(eob_d32, eob_d32); vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0); -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 /* qcoeff = x */ vst1q_s16(d->qcoeff, x0); diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 190b013af..03691fc9d 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -1080,7 +1080,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, if (xd->update_mb_segmentation_data) { signed char Data; - vp8_write_bit(bc, xd->mb_segement_abs_delta); + vp8_write_bit(bc, xd->mb_segment_abs_delta); /* For each segmentation feature (Quant and loop filter level) */ for (i = 0; i < MB_LVL_MAX; ++i) { diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c index e54d1e9f4..a666bca4d 100644 --- a/vp8/encoder/denoising.c +++ b/vp8/encoder/denoising.c @@ -135,7 +135,7 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, // When adopting aggressive denoiser, the adj_val for each pixel // could be at most 8 (this is current max adjustment of the map). // In SSE code, we calculate the sum of adj_val for - // the columns, so the sum could be upto 128(16 rows). However, + // the columns, so the sum could be up to 128(16 rows). However, // the range of the value is -128 ~ 127 in SSE code, that's why // we do this change in C code. // We don't do this for UV denoiser, since there are only 8 rows, diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 620107500..5c973940e 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -7,6 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
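In the fastquantizeb_neon.c hunks above, the condition moves from the compiler-defined __aarch64__ to libvpx's own VPX_ARCH_AARCH64 config macro. The two code paths exist because only AArch64 NEON has an across-vector maximum (vmaxvq_u16); 32-bit NEON must reduce with pairwise maxima. A self-contained sketch of the same reduction (the tree's version finishes with a u32 reinterpret and a lane store rather than returning a value; VPX_ARCH_AARCH64 comes from libvpx's generated config):

#include <arm_neon.h>

static int max_lane_u16(uint16x8_t v) {
#if VPX_ARCH_AARCH64
  return (int)vmaxvq_u16(v); /* single across-vector max on AArch64 */
#else
  uint16x4_t m = vmax_u16(vget_low_u16(v), vget_high_u16(v)); /* 8 -> 4 lanes */
  m = vpmax_u16(m, m); /* pairwise max: 4 -> 2 meaningful lanes */
  m = vpmax_u16(m, m); /* 2 -> 1 */
  return (int)vget_lane_u16(m, 0);
#endif
}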
*/ +#include <stdio.h> +#include <limits.h> #include "vpx_config.h" #include "vp8_rtcd.h" @@ -29,9 +31,9 @@ #include "rdopt.h" #include "pickinter.h" #include "vp8/common/findnearmv.h" -#include <stdio.h> -#include <limits.h> #include "vp8/common/invtrans.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx_mem/vpx_mem.h" #include "vpx_ports/vpx_timer.h" #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING #include "bitstream.h" @@ -123,7 +125,7 @@ static void calc_av_activity(VP8_COMP *cpi, int64_t activity_sum) { unsigned int tmp; /* Create a list to sort to */ - CHECK_MEM_ERROR(sortlist, + CHECK_MEM_ERROR(&cpi->common.error, sortlist, vpx_calloc(sizeof(unsigned int), cpi->common.MBs)); /* Copy map to sort list */ @@ -750,6 +752,15 @@ void vp8_encode_frame(VP8_COMP *cpi) { vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, cpi->encoding_thread_count); + if (cpi->mt_current_mb_col_size != cm->mb_rows) { + vpx_free(cpi->mt_current_mb_col); + cpi->mt_current_mb_col = NULL; + cpi->mt_current_mb_col_size = 0; + CHECK_MEM_ERROR( + &cpi->common.error, cpi->mt_current_mb_col, + vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); + cpi->mt_current_mb_col_size = cm->mb_rows; + } for (i = 0; i < cm->mb_rows; ++i) vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1); diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index cb35f4f49..e2f8b89d4 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -7,6 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include <stddef.h> #include "onyx_int.h" #include "vp8/common/threading.h" @@ -402,7 +403,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) { zd->subpixel_predict8x8 = xd->subpixel_predict8x8; zd->subpixel_predict16x16 = xd->subpixel_predict16x16; zd->segmentation_enabled = xd->segmentation_enabled; - zd->mb_segement_abs_delta = xd->mb_segement_abs_delta; + zd->mb_segment_abs_delta = xd->mb_segment_abs_delta; memcpy(zd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data)); @@ -487,15 +488,10 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, int vp8cx_create_encoder_threads(VP8_COMP *cpi) { const VP8_COMMON *cm = &cpi->common; - - vpx_atomic_init(&cpi->b_multi_threaded, 0); - cpi->encoding_thread_count = 0; - cpi->b_lpf_running = 0; + int th_count = 0; if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) { - int ithread; - int th_count = cpi->oxcf.multi_threaded - 1; - int rc = 0; + th_count = cpi->oxcf.multi_threaded - 1; /* don't allocate more threads than cores available */ if (cpi->oxcf.multi_threaded > cm->processor_core_count) { @@ -507,19 +503,24 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1)) { th_count = (cm->mb_cols / cpi->mt_sync_range) - 1; } + } + if (th_count == cpi->encoding_thread_count) return 0; - if (th_count == 0) return 0; + vp8cx_remove_encoder_threads(cpi); + if (th_count != 0) { + int ithread; + int rc = 0; - CHECK_MEM_ERROR(cpi->h_encoding_thread, + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * th_count)); - CHECK_MEM_ERROR(cpi->h_event_start_encoding, + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * th_count)); - CHECK_MEM_ERROR(cpi->h_event_end_encoding, + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding, vpx_malloc(sizeof(sem_t) * th_count)); - 
CHECK_MEM_ERROR(cpi->mb_row_ei, + CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count)); memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count); - CHECK_MEM_ERROR(cpi->en_thread_data, + CHECK_MEM_ERROR(&cpi->common.error, cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count)); vpx_atomic_store_release(&cpi->b_multi_threaded, 1); @@ -553,6 +554,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* shutdown other threads */ vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { + sem_post(&cpi->h_event_start_encoding[ithread]); + sem_post(&cpi->h_event_end_encoding[ithread]); pthread_join(cpi->h_encoding_thread[ithread], 0); sem_destroy(&cpi->h_event_start_encoding[ithread]); sem_destroy(&cpi->h_event_end_encoding[ithread]); @@ -560,10 +563,16 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* free thread related resources */ vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; return -1; } @@ -592,10 +601,16 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* free thread related resources */ vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; return -2; } @@ -627,13 +642,23 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { sem_destroy(&cpi->h_event_end_lpf); sem_destroy(&cpi->h_event_start_lpf); + cpi->b_lpf_running = 0; /* free thread related resources */ + vpx_free(cpi->mt_current_mb_col); + cpi->mt_current_mb_col = NULL; + cpi->mt_current_mb_col_size = 0; vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; } } #endif diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 65d2681c9..4443f5e7c 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -412,7 +412,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, int_mv ref_mv_full; int tmp_err; - int step_param = 3; /* Dont search over full range for first pass */ + int step_param = 3; /* Don't search over full range for first pass */ int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; int n; vp8_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; @@ -821,22 +821,6 @@ void vp8_first_pass(VP8_COMP *cpi) { vp8_yv12_copy_frame(lst_yv12, gld_yv12); } - /* use this to see what the first pass reconstruction looks like */ - if (0) { - char filename[512]; - FILE *recon_file; - sprintf(filename, "enc%04d.yuv", (int)cm->current_video_frame); - - if (cm->current_video_frame == 0) { - recon_file = fopen(filename, "wb"); - } else { - recon_file = fopen(filename, "ab"); - } - - (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file); - 
fclose(recon_file); - } - cm->current_video_frame++; } extern const int vp8_bits_per_mb[2][QINDEX_RANGE]; @@ -1038,12 +1022,6 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, double clip_iifactor; int overhead_bits_per_mb; - if (0) { - FILE *f = fopen("epmp.stt", "a"); - fprintf(f, "%10.2f\n", err_per_mb); - fclose(f); - } - target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs); @@ -1230,17 +1208,6 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, Q++; } - if (0) { - FILE *f = fopen("estkf_q.stt", "a"); - fprintf(f, "%8d %8d %8d %8.2f %8.3f %8.2f %8.3f %8.3f %8.3f %8d\n", - cpi->common.current_video_frame, bits_per_mb_at_this_q, - target_norm_bits_per_mb, err_per_mb, err_correction_factor, - current_spend_ratio, group_iiratio, iiratio_correction_factor, - (double)cpi->buffer_level / (double)cpi->oxcf.optimal_buffer_level, - Q); - fclose(f); - } - return Q; } @@ -1537,7 +1504,7 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames, /* Calculate the baseline boost number for this frame */ r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out); - /* We want to discount the the flash frame itself and the recovery + /* We want to discount the flash frame itself and the recovery * frame that follows as both will have poor scores. */ flash_detected = @@ -1581,7 +1548,7 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames, /* Calculate the baseline boost number for this frame */ r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out); - /* We want to discount the the flash frame itself and the recovery + /* We want to discount the flash frame itself and the recovery * frame that follows as both will have poor scores. 
*/ flash_detected = @@ -1717,9 +1684,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { /* Break at cpi->max_gf_interval unless almost totally static */ (i >= cpi->max_gf_interval && (decay_accumulator < 0.995)) || ( - /* Dont break out with a very short interval */ + /* Don't break out with a very short interval */ (i > MIN_GF_INTERVAL) && - /* Dont break out very close to a key frame */ + /* Don't break out very close to a key frame */ ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) && ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) && (!flash_detected) && @@ -1765,7 +1732,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (boost_score > max_boost) boost_score = max_boost; } - /* Dont allow conventional gf too near the next kf */ + /* Don't allow conventional gf too near the next kf */ if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) { while (i < cpi->twopass.frames_to_key) { i++; @@ -1786,9 +1753,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { alt_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); #endif - /* Should we use the alternate refernce frame */ + /* Should we use the alternate reference frame */ if (allow_alt_ref && (i >= MIN_GF_INTERVAL) && - /* dont use ARF very near next kf */ + /* don't use ARF very near next kf */ (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) && #if NEW_BOOST ((next_frame.pcnt_inter > 0.75) || (next_frame.pcnt_second_ref > 0.5)) && @@ -2082,7 +2049,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } - /* Dont allow a negative value for gf_bits */ + /* Don't allow a negative value for gf_bits */ if (gf_bits < 0) gf_bits = 0; /* Add in minimum for a frame */ @@ -2123,7 +2090,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (cpi->twopass.gf_group_bits < 0) cpi->twopass.gf_group_bits = 0; /* This condition could fail if there are two kfs very close together - * despite (MIN_GF_INTERVAL) and would cause a devide by 0 in the + * despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the * calculation of cpi->twopass.alt_extra_bits. */ if (cpi->baseline_gf_interval >= 3) { @@ -2393,7 +2360,7 @@ void vp8_second_pass(VP8_COMP *cpi) { } /* The last few frames of a clip almost always have to few or too many - * bits and for the sake of over exact rate control we dont want to make + * bits and for the sake of over exact rate control we don't want to make * radical adjustments to the allowed quantizer range just to use up a * few surplus bits or get beneath the target rate. */ @@ -2990,8 +2957,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } /* Set back to unscaled by defaults */ - cpi->common.horiz_scale = NORMAL; - cpi->common.vert_scale = NORMAL; + cpi->common.horiz_scale = VP8E_NORMAL; + cpi->common.vert_scale = VP8E_NORMAL; /* Calculate Average bits per frame. 
*/ av_bits_per_frame = cpi->oxcf.target_bandwidth / @@ -3011,7 +2978,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { bits_per_frame = (double)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key); - /* Dont turn to resampling in easy sections just because they + /* Don't turn to resampling in easy sections just because they * have been assigned a small number of bits */ if (bits_per_frame < av_bits_per_frame) { @@ -3047,16 +3014,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { (int)((projected_bits_perframe - av_bits_per_frame) * cpi->twopass.frames_to_key)); - if (0) { - FILE *f = fopen("Subsamle.stt", "a"); - fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n", - cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale, - cpi->common.vert_scale, kf_group_err / cpi->twopass.frames_to_key, - (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), - new_height, new_width); - fclose(f); - } - /* The trigger for spatial resampling depends on the various * parameters such as whether we are streaming (CBR) or VBR. */ @@ -3120,17 +3077,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { */ kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, (int)bits_per_frame, group_iiratio); - - if (0) { - FILE *f = fopen("Subsamle.stt", "a"); - fprintf( - f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n", kf_q, - cpi->common.horiz_scale, cpi->common.vert_scale, - kf_group_err / cpi->twopass.frames_to_key, - (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), - new_height, new_width); - fclose(f); - } } } diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index b92e2135e..bc150e482 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -1123,8 +1123,8 @@ int vp8_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, } } - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; + this_mv.as_mv.row = best_mv->as_mv.row * 8; + this_mv.as_mv.col = best_mv->as_mv.col * 8; return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); @@ -1441,8 +1441,8 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } } - this_mv.as_mv.row = ref_mv->as_mv.row << 3; - this_mv.as_mv.col = ref_mv->as_mv.col << 3; + this_mv.as_mv.row = ref_mv->as_mv.row * 8; + this_mv.as_mv.col = ref_mv->as_mv.col * 8; return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); diff --git a/vp8/encoder/mr_dissim.c b/vp8/encoder/mr_dissim.c index 011b62a08..b1bfb4b54 100644 --- a/vp8/encoder/mr_dissim.c +++ b/vp8/encoder/mr_dissim.c @@ -49,7 +49,6 @@ void vp8_cal_low_res_mb_cols(VP8_COMP *cpi) { void vp8_cal_dissimilarity(VP8_COMP *cpi) { VP8_COMMON *cm = &cpi->common; - int i; /* Note: The first row & first column in mip are outside the frame, which * were initialized to all 0.(ref_frame, mode, mv...) 
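The two mcomp.c hunks above replace left shifts with multiplies when converting whole-pel motion vectors to 1/8-pel units: MV components are signed and may be negative, and left-shifting a negative value is undefined behavior in C, while the equivalent multiply is well-defined and produces the same code. The pattern in isolation:

/* was: this_mv.as_mv.row = best_mv->as_mv.row << 3;  (UB for negative rows) */
static int whole_to_eighth_pel(int mv_component) {
  return mv_component * 8; /* defined for negative values; identical codegen */
}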
@@ -67,6 +66,7 @@ void vp8_cal_dissimilarity(VP8_COMP *cpi) { store_info->frame_type = cm->frame_type; if (cm->frame_type != KEY_FRAME) { + int i; store_info->is_frame_dropped = 0; for (i = 1; i < MAX_REF_FRAMES; ++i) store_info->low_res_ref_frames[i] = cpi->current_ref_frames[i]; diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 148a16cc4..4e128e3c4 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -270,7 +270,7 @@ static int rescale(int val, int num, int denom) { return (int)(llval * llnum / llden); } -void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, +void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf, const int layer, double prev_layer_framerate) { LAYER_CONTEXT *lc = &cpi->layer_context[layer]; @@ -328,7 +328,7 @@ void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, // for any "new" layers. For "existing" layers, let them inherit the parameters // from the previous layer state (at the same layer #). In future we may want // to better map the previous layer state(s) to the "new" ones. -void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, +void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf, const int prev_num_layers) { int i; double prev_layer_framerate = 0; @@ -442,11 +442,6 @@ static void dealloc_compressor_data(VP8_COMP *cpi) { vpx_free(cpi->mb.pip); cpi->mb.pip = 0; - -#if CONFIG_MULTITHREAD - vpx_free(cpi->mt_current_mb_col); - cpi->mt_current_mb_col = NULL; -#endif } static void enable_segmentation(VP8_COMP *cpi) { @@ -488,7 +483,7 @@ static void set_segmentation_map(VP8_COMP *cpi, */ static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned char abs_delta) { - cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta; + cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta; memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data)); } @@ -1169,7 +1164,8 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { #else unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16; #endif - CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); + CHECK_MEM_ERROR(&cpi->common.error, cpi->tok, + vpx_calloc(tokens, sizeof(*cpi->tok))); } /* Data used for real time vc mode to see if gf needs refreshing */ @@ -1178,37 +1174,39 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { /* Structures used to monitor GF usage */ vpx_free(cpi->gf_active_flags); CHECK_MEM_ERROR( - cpi->gf_active_flags, + &cpi->common.error, cpi->gf_active_flags, vpx_calloc(sizeof(*cpi->gf_active_flags), cm->mb_rows * cm->mb_cols)); cpi->gf_active_count = cm->mb_rows * cm->mb_cols; vpx_free(cpi->mb_activity_map); CHECK_MEM_ERROR( - cpi->mb_activity_map, + &cpi->common.error, cpi->mb_activity_map, vpx_calloc(sizeof(*cpi->mb_activity_map), cm->mb_rows * cm->mb_cols)); /* allocate memory for storing last frame's MVs for MV prediction. 
*/ vpx_free(cpi->lfmv); - CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), - sizeof(*cpi->lfmv))); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->lfmv, + vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lfmv))); vpx_free(cpi->lf_ref_frame_sign_bias); - CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, + CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame_sign_bias, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lf_ref_frame_sign_bias))); vpx_free(cpi->lf_ref_frame); - CHECK_MEM_ERROR(cpi->lf_ref_frame, + CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lf_ref_frame))); /* Create the encoder segmentation map and set all entries to 0 */ vpx_free(cpi->segmentation_map); CHECK_MEM_ERROR( - cpi->segmentation_map, + &cpi->common.error, cpi->segmentation_map, vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->segmentation_map))); cpi->cyclic_refresh_mode_index = 0; vpx_free(cpi->active_map); - CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cm->mb_rows * cm->mb_cols, - sizeof(*cpi->active_map))); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->active_map, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->active_map))); memset(cpi->active_map, 1, (cm->mb_rows * cm->mb_cols)); #if CONFIG_MULTITHREAD @@ -1221,21 +1219,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { } else { cpi->mt_sync_range = 16; } - - if (cpi->oxcf.multi_threaded > 1) { - int i; - - vpx_free(cpi->mt_current_mb_col); - CHECK_MEM_ERROR(cpi->mt_current_mb_col, - vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); - for (i = 0; i < cm->mb_rows; ++i) - vpx_atomic_init(&cpi->mt_current_mb_col[i], 0); - } - #endif vpx_free(cpi->tplist); - CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows)); + CHECK_MEM_ERROR(&cpi->common.error, cpi->tplist, + vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows)); #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { @@ -1302,7 +1290,7 @@ void vp8_new_framerate(VP8_COMP *cpi, double framerate) { } } -static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { +static void init_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) { VP8_COMMON *cm = &cpi->common; cpi->oxcf = *oxcf; @@ -1424,11 +1412,11 @@ void vp8_update_layer_contexts(VP8_COMP *cpi) { } } -void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { +void vp8_change_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) { VP8_COMMON *cm = &cpi->common; int last_w, last_h; unsigned int prev_number_of_layers; - unsigned int raw_target_rate; + double raw_target_rate; if (!cpi) return; @@ -1443,11 +1431,6 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { last_h = cpi->oxcf.Height; prev_number_of_layers = cpi->oxcf.number_of_layers; - if (cpi->initial_width) { - // TODO(https://crbug.com/1486441): Allow changing thread counts; the - // allocation is done once in vp8_create_compressor(). 
- oxcf->multi_threaded = cpi->oxcf.multi_threaded; - } cpi->oxcf = *oxcf; switch (cpi->oxcf.Mode) { @@ -1574,10 +1557,10 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cpi->oxcf.maximum_buffer_size_in_ms = 240000; } - raw_target_rate = (unsigned int)((int64_t)cpi->oxcf.Width * cpi->oxcf.Height * - 8 * 3 * cpi->framerate / 1000); + raw_target_rate = ((int64_t)cpi->oxcf.Width * cpi->oxcf.Height * 8 * 3 * + cpi->framerate / 1000.0); if (cpi->oxcf.target_bandwidth > raw_target_rate) - cpi->oxcf.target_bandwidth = raw_target_rate; + cpi->oxcf.target_bandwidth = (unsigned int)raw_target_rate; /* Convert target bandwidth from Kbit/s to Bit/s */ cpi->oxcf.target_bandwidth *= 1000; @@ -1672,7 +1655,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cm->sharpness_level = cpi->oxcf.Sharpness; - if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) { + if (cm->horiz_scale != VP8E_NORMAL || cm->vert_scale != VP8E_NORMAL) { int hr, hs, vr, vs; Scale2Ratio(cm->horiz_scale, &hr, &hs); @@ -1756,7 +1739,7 @@ static void cal_mvsadcosts(int *mvsadcost[2]) { } while (++i <= mvfp_max); } -struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { +struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) { int i; VP8_COMP *cpi; @@ -1778,8 +1761,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->common.error.setjmp = 1; - CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), - (MAX_MVSEARCH_STEPS * 8) + 1)); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->mb.ss, + vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1)); vp8_create_common(&cpi->common); @@ -1884,18 +1868,19 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { } if (cpi->cyclic_refresh_mode_enabled) { - CHECK_MEM_ERROR(cpi->cyclic_refresh_map, + CHECK_MEM_ERROR(&cpi->common.error, cpi->cyclic_refresh_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); } else { cpi->cyclic_refresh_map = (signed char *)NULL; } - CHECK_MEM_ERROR(cpi->skin_map, vpx_calloc(cm->mb_rows * cm->mb_cols, - sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->skin_map, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(cpi->skin_map[0]))); - CHECK_MEM_ERROR(cpi->consec_zero_last, + CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last, vpx_calloc(cm->mb_rows * cm->mb_cols, 1)); - CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias, + CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last_mvbias, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); /*Initialize the feed-forward activity masking.*/ @@ -1914,6 +1899,7 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->force_maxqp = 0; cpi->frames_since_last_drop_overshoot = 0; cpi->rt_always_update_correction_factor = 0; + cpi->rt_drop_recode_on_overshoot = 1; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -2114,7 +2100,6 @@ void vp8_remove_compressor(VP8_COMP **comp) { double time_encoded = (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / 10000000.000; - double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded; if (cpi->b_calculate_psnr) { if (cpi->oxcf.number_of_layers > 1) { @@ -2143,6 +2128,7 @@ void vp8_remove_compressor(VP8_COMP **comp) { total_psnr2, total_ssim); } } else { + double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded; double samples = 3.0 / 2 * cpi->count * cpi->common.Width * cpi->common.Height; double total_psnr = @@ -2250,7 +2236,7 @@ void vp8_remove_compressor(VP8_COMP **comp) { #if 0 { 
printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); - printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); + printf("\n_frames receive_data encod_mb_row compress_frame Total\n"); printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000); } #endif @@ -2509,15 +2495,17 @@ static int resize_key_frame(VP8_COMP *cpi) { if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100)) { cm->horiz_scale = - (cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO; - cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO; + (cm->horiz_scale < VP8E_ONETWO) ? cm->horiz_scale + 1 : VP8E_ONETWO; + cm->vert_scale = + (cm->vert_scale < VP8E_ONETWO) ? cm->vert_scale + 1 : VP8E_ONETWO; } /* Should we now start scaling back up */ else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100)) { cm->horiz_scale = - (cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL; - cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL; + (cm->horiz_scale > VP8E_NORMAL) ? cm->horiz_scale - 1 : VP8E_NORMAL; + cm->vert_scale = + (cm->vert_scale > VP8E_NORMAL) ? cm->vert_scale - 1 : VP8E_NORMAL; } /* Get the new height and width */ @@ -2748,7 +2736,7 @@ static int decide_key_frame(VP8_COMP *cpi) { } /* in addition if the following are true and this is not a golden frame * then code a key frame Note that on golden frames there often seems - * to be a pop in intra useage anyway hence this restriction is + * to be a pop in intra usage anyway hence this restriction is * designed to prevent spurious key frames. The Intra pop needs to be * investigated. */ @@ -3196,6 +3184,113 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { vp8_yv12_extend_frame_borders(cm->frame_to_show); } +// Return 1 if frame is to be dropped. Update frame drop decimation +// counters. +int vp8_check_drop_buffer(VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark * + cpi->oxcf.optimal_buffer_level / 100); + int drop_mark75 = drop_mark * 2 / 3; + int drop_mark50 = drop_mark / 4; + int drop_mark25 = drop_mark / 8; + if (cpi->drop_frames_allowed) { + /* The reset to decimation 0 is only done here for one pass. + * Once it is set two pass leaves decimation on till the next kf. + */ + if (cpi->buffer_level > drop_mark && cpi->decimation_factor > 0) { + cpi->decimation_factor--; + } + + if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) { + cpi->decimation_factor = 1; + + } else if (cpi->buffer_level < drop_mark25 && + (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) { + cpi->decimation_factor = 3; + } else if (cpi->buffer_level < drop_mark50 && + (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) { + cpi->decimation_factor = 2; + } else if (cpi->buffer_level < drop_mark75 && + (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) { + cpi->decimation_factor = 1; + } + } + + /* The following decimates the frame rate according to a regular + * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help + * prevent buffer under-run in CBR mode. Alternatively it might be + * desirable in some situations to drop frame rate but throw more bits + * at each frame. 
+ * + * Note that dropping a key frame can be problematic if spatial + * resampling is also active + */ + if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) { + switch (cpi->decimation_factor) { + case 1: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2; + break; + case 2: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; + break; + case 3: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; + break; + } + + /* Note that we should not throw out a key frame (especially when + * spatial resampling is enabled). + */ + if (cm->frame_type == KEY_FRAME) { + cpi->decimation_count = cpi->decimation_factor; + } else if (cpi->decimation_count > 0) { + cpi->decimation_count--; + + cpi->bits_off_target += cpi->av_per_frame_bandwidth; + if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { + cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; + } + +#if CONFIG_MULTI_RES_ENCODING + vp8_store_drop_frame_info(cpi); +#endif + + cm->current_video_frame++; + cpi->frames_since_key++; + cpi->ext_refresh_frame_flags_pending = 0; + // We advance the temporal pattern for dropped frames. + cpi->temporal_pattern_counter++; + +#if CONFIG_INTERNAL_STATS + cpi->count++; +#endif + + cpi->buffer_level = cpi->bits_off_target; + + if (cpi->oxcf.number_of_layers > 1) { + unsigned int i; + + /* Propagate bits saved by dropping the frame to higher + * layers + */ + for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); + if (lc->bits_off_target > lc->maximum_buffer_size) { + lc->bits_off_target = lc->maximum_buffer_size; + } + lc->buffer_level = lc->bits_off_target; + } + } + return 1; + } else { + cpi->decimation_count = cpi->decimation_factor; + } + } else { + cpi->decimation_count = 0; + } + return 0; +} static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, unsigned char *dest, @@ -3206,7 +3301,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, int frame_under_shoot_limit; int Loop = 0; - int loop_count; VP8_COMMON *cm = &cpi->common; int active_worst_qchanged = 0; @@ -3222,12 +3316,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, int undershoot_seen = 0; #endif - int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark * - cpi->oxcf.optimal_buffer_level / 100); - int drop_mark75 = drop_mark * 2 / 3; - int drop_mark50 = drop_mark / 4; - int drop_mark25 = drop_mark / 8; - /* Clear down mmx registers to allow floating point in what follows */ vpx_clear_system_state(); @@ -3441,102 +3529,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, update_rd_ref_frame_probs(cpi); - if (cpi->drop_frames_allowed) { - /* The reset to decimation 0 is only done here for one pass. - * Once it is set two pass leaves decimation on till the next kf. 
- */ - if ((cpi->buffer_level > drop_mark) && (cpi->decimation_factor > 0)) { - cpi->decimation_factor--; - } - - if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) { - cpi->decimation_factor = 1; - - } else if (cpi->buffer_level < drop_mark25 && - (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) { - cpi->decimation_factor = 3; - } else if (cpi->buffer_level < drop_mark50 && - (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) { - cpi->decimation_factor = 2; - } else if (cpi->buffer_level < drop_mark75 && - (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) { - cpi->decimation_factor = 1; - } - } - - /* The following decimates the frame rate according to a regular - * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help - * prevent buffer under-run in CBR mode. Alternatively it might be - * desirable in some situations to drop frame rate but throw more bits - * at each frame. - * - * Note that dropping a key frame can be problematic if spatial - * resampling is also active - */ - if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) { - switch (cpi->decimation_factor) { - case 1: - cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2; - break; - case 2: - cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; - break; - case 3: - cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; - break; - } - - /* Note that we should not throw out a key frame (especially when - * spatial resampling is enabled). - */ - if (cm->frame_type == KEY_FRAME) { - cpi->decimation_count = cpi->decimation_factor; - } else if (cpi->decimation_count > 0) { - cpi->decimation_count--; - - cpi->bits_off_target += cpi->av_per_frame_bandwidth; - if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { - cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; - } - -#if CONFIG_MULTI_RES_ENCODING - vp8_store_drop_frame_info(cpi); -#endif - - cm->current_video_frame++; - cpi->frames_since_key++; - cpi->ext_refresh_frame_flags_pending = 0; - // We advance the temporal pattern for dropped frames. 
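The block deleted here is the decimation logic that moved verbatim into vp8_check_drop_buffer() above. For reference, a worked example of the buffer thresholds it computes, using hypothetical configuration values; note the 75/50/25 suffixes are historical and do not match the fractions actually applied:

/* Assume drop_frames_water_mark = 70 (percent) and an optimal buffer
 * level of 600000 bits. */
int drop_mark = (int)(70 * 600000L / 100); /* 420000 bits */
int drop_mark75 = drop_mark * 2 / 3;       /* 280000: really 2/3 */
int drop_mark50 = drop_mark / 4;           /* 105000: really 1/4 */
int drop_mark25 = drop_mark / 8;           /*  52500: really 1/8 */

Decimation then engages as buffer_level falls through these marks, dropping toward a 2/3 or 1/2 frame-rate pattern while scaling per_frame_bandwidth up by 5/4 or 3/2.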
- cpi->temporal_pattern_counter++; - -#if CONFIG_INTERNAL_STATS - cpi->count++; -#endif - - cpi->buffer_level = cpi->bits_off_target; - - if (cpi->oxcf.number_of_layers > 1) { - unsigned int i; - - /* Propagate bits saved by dropping the frame to higher - * layers - */ - for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { - LAYER_CONTEXT *lc = &cpi->layer_context[i]; - lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); - if (lc->bits_off_target > lc->maximum_buffer_size) { - lc->bits_off_target = lc->maximum_buffer_size; - } - lc->buffer_level = lc->bits_off_target; - } - } - - return; - } else { - cpi->decimation_count = cpi->decimation_factor; - } - } else { - cpi->decimation_count = 0; + if (vp8_check_drop_buffer(cpi)) { + return; } /* Decide how big to make the frame */ @@ -3635,7 +3629,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, Q = cpi->avg_frame_qindex; } - /* For constrained quality dont allow Q less than the cq level */ + /* For constrained quality don't allow Q less than the cq level */ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && (Q < cpi->cq_target_quality)) { Q = cpi->cq_target_quality; @@ -3662,7 +3656,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } else { cpi->active_best_quality = inter_minq[Q]; - /* For the constant/constrained quality mode we dont want + /* For the constant/constrained quality mode we don't want * q to fall below the cq level. */ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && @@ -3683,7 +3677,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, * higher quality on the frames to prevent bits just going to waste. */ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { - /* Note that the use of >= here elliminates the risk of a devide + /* Note that the use of >= here elliminates the risk of a divide * by 0 error in the else if clause */ if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size) { @@ -3772,8 +3766,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, vp8_save_coding_context(cpi); - loop_count = 0; - scale_and_extend_source(cpi->un_scaled_source, cpi); #if CONFIG_TEMPORAL_DENOISING && CONFIG_POSTPROC @@ -3946,7 +3938,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, /* transform / motion compensation build reconstruction frame */ vp8_encode_frame(cpi); - if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->rt_drop_recode_on_overshoot == 1) { if (vp8_drop_encodedframe_overshoot(cpi, Q)) { vpx_clear_system_state(); return; @@ -3996,7 +3989,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, q_low = cpi->active_best_quality; q_high = cpi->active_worst_quality; - loop_count++; Loop = 1; continue; @@ -4222,7 +4214,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, if (Loop == 1) { vp8_restore_coding_context(cpi); - loop_count++; #if CONFIG_INTERNAL_STATS cpi->tot_recode_hits++; #endif @@ -4324,12 +4315,12 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, vp8_cal_dissimilarity(cpi); #endif - /* Update the GF useage maps. + /* Update the GF usage maps. * This is done after completing the compression of a frame when all * modes etc. 
are finalized but before loop filter */ if (cpi->oxcf.number_of_layers == 1) { - vp8_update_gf_useage_maps(cpi, cm, &cpi->mb); + vp8_update_gf_usage_maps(cpi, cm, &cpi->mb); } if (cm->frame_type == KEY_FRAME) cm->refresh_last_frame = 1; @@ -4486,7 +4477,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, * size within range) then use the last frame value - 1. The -1 * is designed to stop Q and hence the data rate, from * progressively falling away during difficult sections, but at - * the same time reduce the number of itterations around the + * the same time reduce the number of iterations around the * recode loop. */ if (Q > cpi->ni_av_qi) cpi->ni_av_qi = Q - 1; @@ -4733,7 +4724,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->mb.e_mbd.update_mb_segmentation_data = 0; cpi->mb.e_mbd.mode_ref_lf_delta_update = 0; - /* Dont increment frame counters if this was an altref buffer update + /* Don't increment frame counters if this was an altref buffer update * not a real frame */ if (cm->show_frame) { @@ -5111,7 +5102,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, if (cm->refresh_last_frame) memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc)); - /* if its a dropped frame honor the requests on subsequent frames */ + /* if it's a dropped frame honor the requests on subsequent frames */ if (*size > 0) { cpi->droppable = !frame_is_reference(cpi); @@ -5385,15 +5376,15 @@ int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, } } -int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode) { - if (horiz_mode <= ONETWO) { +int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode) { + if (horiz_mode <= VP8E_ONETWO) { cpi->common.horiz_scale = horiz_mode; } else { return -1; } - if (vert_mode <= ONETWO) { + if (vert_mode <= VP8E_ONETWO) { cpi->common.vert_scale = vert_mode; } else { return -1; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 46a17913a..1451a2781 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -215,7 +215,7 @@ enum { typedef struct { /* Layer configuration */ double framerate; - int target_bandwidth; + int target_bandwidth; /* bits per second */ /* Layer specific coding parameters */ int64_t starting_buffer_level; @@ -360,7 +360,7 @@ typedef struct VP8_COMP { /* GF interval chosen when we coded the last GF */ int current_gf_interval; - /* Total bits overspent becasue of GF boost (cumulative) */ + /* Total bits overspent because of GF boost (cumulative) */ int gf_overspend_bits; /* Used in the few frames following a GF to recover the extra bits @@ -438,7 +438,7 @@ typedef struct VP8_COMP { int kf_boost; int last_boost; - int target_bandwidth; + int target_bandwidth; /* bits per second */ struct vpx_codec_pkt_list *output_pkt_list; #if 0 @@ -526,6 +526,7 @@ typedef struct VP8_COMP { #if CONFIG_MULTITHREAD /* multithread data */ vpx_atomic_int *mt_current_mb_col; + int mt_current_mb_col_size; int mt_sync_range; vpx_atomic_int b_multi_threaded; int encoding_thread_count; @@ -707,15 +708,19 @@ typedef struct VP8_COMP { // Always update correction factor used for rate control after each frame for // realtime encoding. int rt_always_update_correction_factor; + + // Flag to indicate frame may be dropped due to large expected overshoot, + // and re-encoded on next frame at max_qp. 
+ int rt_drop_recode_on_overshoot; } VP8_COMP; void vp8_initialize_enc(void); void vp8_alloc_compressor_data(VP8_COMP *cpi); int vp8_reverse_trans(int x); -void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, +void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf, const int prev_num_layers); -void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, +void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf, const int layer, double prev_layer_framerate); void vp8_update_layer_contexts(VP8_COMP *cpi); @@ -731,26 +736,8 @@ void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **); void vp8_set_speed_features(VP8_COMP *cpi); -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(cpi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval " at %s:%d", __FILE__, \ - __LINE__); \ - } while (0) -#else -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(cpi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval); \ - } while (0) -#endif +int vp8_check_drop_buffer(VP8_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 04f68c324..1af8a2f9b 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -1103,7 +1103,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity) { /* Store for later use by denoiser. */ - // Dont' denoise with GOLDEN OR ALTREF is they are old reference + // Don't denoise with GOLDEN OR ALTREF is they are old reference // frames (greater than MAX_GF_ARF_DENOISE_RANGE frames in past). int skip_old_reference = ((this_ref_frame != LAST_FRAME) && (cpi->common.current_video_frame - diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index 9cd3963e2..fcd4eb04e 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -346,7 +346,8 @@ static void calc_iframe_target_size(VP8_COMP *cpi) { /* Minimal target size is |2* per_frame_bandwidth|. */ if (kf_boost < 16) kf_boost = 16; - target = ((16 + kf_boost) * cpi->per_frame_bandwidth) >> 4; + target = ((uint64_t)(16 + kf_boost) * cpi->per_frame_bandwidth) >> 4; + target = VPXMIN(INT_MAX, target); } if (cpi->oxcf.rc_max_intra_bitrate_pct) { @@ -388,7 +389,7 @@ static void calc_gf_params(VP8_COMP *cpi) { (cpi->oxcf.fixed_q < 0) ? 
cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; int Boost = 0; - int gf_frame_useage = 0; /* Golden frame useage since last GF */ + int gf_frame_usage = 0; /* Golden frame usage since last GF */ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + cpi->recent_ref_frame_usage[LAST_FRAME] + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + @@ -398,12 +399,12 @@ static void calc_gf_params(VP8_COMP *cpi) { (cpi->common.mb_rows * cpi->common.mb_cols); if (tot_mbs) { - gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]) * - 100 / tot_mbs; + gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * + 100 / tot_mbs; } - if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; + if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active; /* Not two pass */ if (cpi->pass != 2) { @@ -467,7 +468,7 @@ static void calc_gf_params(VP8_COMP *cpi) { /* Adjust boost based upon ambient Q */ Boost = GFQ_ADJUSTMENT; - /* Adjust based upon most recently measure intra useage */ + /* Adjust based upon most recently measure intra usage */ Boost = Boost * gf_intra_usage_adjustment[(cpi->this_frame_percent_intra < 15) ? cpi->this_frame_percent_intra @@ -475,7 +476,7 @@ static void calc_gf_params(VP8_COMP *cpi) { 100; /* Adjust gf boost based upon GF usage since last GF */ - Boost = Boost * gf_adjust_table[gf_frame_useage] / 100; + Boost = Boost * gf_adjust_table[gf_frame_usage] / 100; #endif } @@ -516,8 +517,8 @@ static void calc_gf_params(VP8_COMP *cpi) { if (cpi->last_boost >= 1500) cpi->frames_till_gf_update_due++; - if (gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due) { - cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_useage]; + if (gf_interval_table[gf_frame_usage] > cpi->frames_till_gf_update_due) { + cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_usage]; } if (cpi->frames_till_gf_update_due > cpi->max_gf_interval) { @@ -718,7 +719,8 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { } /* lower the target bandwidth for this frame. */ - cpi->this_frame_target -= (cpi->this_frame_target * percent_low) / 200; + cpi->this_frame_target -= + (int)(((int64_t)cpi->this_frame_target * percent_low) / 200); /* Are we using allowing control of active_worst_allowed_q * according to buffer level. @@ -895,7 +897,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { int Q = (cpi->oxcf.fixed_q < 0) ? 
cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; - int gf_frame_useage = 0; /* Golden frame useage since last GF */ + int gf_frame_usage = 0; /* Golden frame usage since last GF */ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + cpi->recent_ref_frame_usage[LAST_FRAME] + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + @@ -905,20 +907,20 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { (cpi->common.mb_rows * cpi->common.mb_cols); if (tot_mbs) { - gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]) * - 100 / tot_mbs; + gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * + 100 / tot_mbs; } - if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; + if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active; /* Is a fixed manual GF frequency being used */ if (cpi->auto_gold) { - /* For one pass throw a GF if recent frame intra useage is - * low or the GF useage is high + /* For one pass throw a GF if recent frame intra usage is + * low or the GF usage is high */ if ((cpi->pass == 0) && - (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) { + (cpi->this_frame_percent_intra < 15 || gf_frame_usage >= 5)) { cpi->common.refresh_golden_frame = 1; /* Two pass GF descision */ @@ -933,10 +935,10 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { if (0) { FILE *f; - f = fopen("gf_useaget.stt", "a"); + f = fopen("gf_usaget.stt", "a"); fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->gfu_boost, - GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage); + GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_usage); fclose(f); } diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index bbddacf8f..5d539ef30 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1021,7 +1021,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, BLOCK *c; BLOCKD *e; - /* Is the best so far sufficiently good that we cant justify + /* Is the best so far sufficiently good that we can't justify * doing a new motion search. 
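
The ratectrl.c hunks above widen two target-size computations to 64 bits before multiplying. A minimal sketch of the pattern, with the clamp that keeps the key-frame result in int range (the function names here are illustrative, not the library's):

#include <limits.h>
#include <stdint.h>

/* (16 + kf_boost) * per_frame_bandwidth can exceed 2^31 for large
 * boosts and bitrates; do the multiply in 64 bits, then clamp. */
static int keyframe_target(int kf_boost, int per_frame_bandwidth) {
  uint64_t target = ((uint64_t)(16 + kf_boost) * per_frame_bandwidth) >> 4;
  if (target > INT_MAX) target = INT_MAX;
  return (int)target;
}

/* Same idea for the signed percentage reduction of a frame target. */
static int lowered_target(int this_frame_target, int percent_low) {
  return this_frame_target -
         (int)(((int64_t)this_frame_target * percent_low) / 200);
}

Casting one operand is enough; the other is promoted before the multiply takes place.
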
*/ if (best_label_rd < label_mv_thresh) break; @@ -1979,7 +1979,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rd.distortion2 += distortion; /* If even the 'Y' rd value of split is higher than best so far - * then dont bother looking at UV + * then don't bother looking at UV */ if (tmp_rd < best_mode.yrd) { /* Now work out UV cost and add it in */ diff --git a/vp8/encoder/segmentation.c b/vp8/encoder/segmentation.c index dcb68119e..212725811 100644 --- a/vp8/encoder/segmentation.c +++ b/vp8/encoder/segmentation.c @@ -11,7 +11,7 @@ #include "segmentation.h" #include "vpx_mem/vpx_mem.h" -void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { +void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { int mb_row, mb_col; MODE_INFO *this_mb_mode_info = cm->mi; @@ -19,7 +19,7 @@ void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { x->gf_active_ptr = (signed char *)cpi->gf_active_flags; if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) { - /* Reset Gf useage monitors */ + /* Reset Gf usage monitors */ memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols)); cpi->gf_active_count = cm->mb_rows * cm->mb_cols; } else { diff --git a/vp8/encoder/segmentation.h b/vp8/encoder/segmentation.h index 4ddbdbbd2..0fecfc221 100644 --- a/vp8/encoder/segmentation.h +++ b/vp8/encoder/segmentation.h @@ -19,8 +19,8 @@ extern "C" { #endif -extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, - MACROBLOCK *x); +extern void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm, + MACROBLOCK *x); #ifdef __cplusplus } // extern "C" diff --git a/vp8/encoder/vp8_quantize.c b/vp8/encoder/vp8_quantize.c index 5b8955510..8e5e31824 100644 --- a/vp8/encoder/vp8_quantize.c +++ b/vp8/encoder/vp8_quantize.c @@ -294,7 +294,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip) { /* Select the baseline MB Q index. 
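
The vp8cx_mb_init_quantizer() hunk just below renames mb_segement_abs_delta to mb_segment_abs_delta. A short sketch of the absolute-versus-delta segment Q selection that code performs, with simplified names and VP8's 0..127 Q-index range assumed:

typedef enum { SEG_DELTADATA = 0, SEG_ABSDATA = 1 } SegAbsDelta;

/* With SEG_ABSDATA the segment feature holds the Q index itself;
 * otherwise it is a delta applied to the frame's base Q index. */
static int mb_qindex(SegAbsDelta mode, int base_qindex, int seg_alt_q) {
  int qindex = (mode == SEG_ABSDATA) ? seg_alt_q : base_qindex + seg_alt_q;
  if (qindex < 0) qindex = 0; /* clamp to the legal range */
  if (qindex > 127) qindex = 127;
  return qindex;
}
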
*/ if (xd->segmentation_enabled) { /* Abs Value */ - if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA) { + if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { QIndex = xd->segment_feature_data[MB_LVL_ALT_Q] [xd->mode_info_context->mbmi.segment_id]; /* Delta Value */ diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 340f3e663..a6f0b4cbc 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -19,6 +19,9 @@ #include "vpx_ports/static_assert.h" #include "vpx_ports/system_state.h" #include "vpx_util/vpx_timestamp.h" +#if CONFIG_MULTITHREAD +#include "vp8/encoder/ethreading.h" +#endif #include "vp8/encoder/onyx_int.h" #include "vpx/vp8cx.h" #include "vp8/encoder/firstpass.h" @@ -488,6 +491,9 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, ctx->cfg = *cfg; set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); +#if CONFIG_MULTITHREAD + if (vp8cx_create_encoder_threads(ctx->cpi)) return VPX_CODEC_ERROR; +#endif ctx->cpi->common.error.setjmp = 0; return VPX_CODEC_OK; } @@ -618,10 +624,11 @@ static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, va_list args) { VP8_COMP *cpi = ctx->cpi; - const unsigned int data = CAST(VP8E_SET_GF_CBR_BOOST_PCT, args); + const unsigned int data = CAST(VP8E_SET_RTC_EXTERNAL_RATECTRL, args); if (data) { cpi->cyclic_refresh_mode_enabled = 0; cpi->rt_always_update_correction_factor = 1; + cpi->rt_drop_recode_on_overshoot = 0; } return VPX_CODEC_OK; } @@ -911,12 +918,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, } } - if (setjmp(ctx->cpi->common.error.jmp)) { - ctx->cpi->common.error.setjmp = 0; - vpx_clear_system_state(); - return VPX_CODEC_CORRUPT_FRAME; - } - /* Initialize the encoder instance on the first frame*/ if (!res && ctx->cpi) { unsigned int lib_flags; @@ -927,6 +928,13 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, unsigned char *cx_data_end; int comp_data_state = 0; + if (setjmp(ctx->cpi->common.error.jmp)) { + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + return VPX_CODEC_CORRUPT_FRAME; + } + ctx->cpi->common.error.setjmp = 1; + /* Set up internal flags */ if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) { ((VP8_COMP *)ctx->cpi)->b_calculate_psnr = 1; @@ -962,8 +970,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, cx_data_end = ctx->cx_data + cx_data_sz; lib_flags = 0; - ctx->cpi->common.error.setjmp = 1; - while (cx_data_sz >= ctx->cx_data_sz / 2) { comp_data_state = vp8_get_compressed_data( ctx->cpi, &lib_flags, &size, cx_data, cx_data_end, &dst_time_stamp, @@ -1059,6 +1065,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, } } } + ctx->cpi->common.error.setjmp = 0; } return res; @@ -1224,8 +1231,8 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx, if (data) { int res; vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data; - res = vp8_set_internal_size(ctx->cpi, (VPX_SCALING)scalemode.h_scaling_mode, - (VPX_SCALING)scalemode.v_scaling_mode); + res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode, + scalemode.v_scaling_mode); if (!res) { /*force next frame a key frame to effect scaling mode */ @@ -1292,8 +1299,8 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { 0, /* rc_resize_allowed */ 1, /* rc_scaled_width */ 1, /* rc_scaled_height */ - 60, /* rc_resize_down_thresold */ - 30, /* rc_resize_up_thresold */ + 60, /* rc_resize_down_thresh */ + 30, 
/* rc_resize_up_thresh */ VPX_VBR, /* rc_end_usage */ { NULL, 0 }, /* rc_twopass_stats_in */ diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 55a77ba7e..2e5d6dcfe 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -20,6 +20,7 @@ #include "vpx_version.h" #include "common/alloccommon.h" #include "common/common.h" +#include "common/onyxc_int.h" #include "common/onyxd.h" #include "decoder/onyxd_int.h" #include "vpx_dsp/vpx_dsp_common.h" @@ -162,7 +163,10 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, si->h = (clear[8] | (clear[9] << 8)) & 0x3fff; /*printf("w=%d, h=%d\n", si->w, si->h);*/ - if (!(si->h && si->w)) res = VPX_CODEC_CORRUPT_FRAME; + if (!(si->h && si->w)) { + si->w = si->h = 0; + res = VPX_CODEC_CORRUPT_FRAME; + } } else { res = VPX_CODEC_UNSUP_BITSTREAM; } @@ -246,14 +250,14 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, /* Store a pointer to this fragment and return. We haven't * received the complete frame yet, so we will wait with decoding. */ - ctx->fragments.ptrs[ctx->fragments.count] = data; - ctx->fragments.sizes[ctx->fragments.count] = data_sz; - ctx->fragments.count++; - if (ctx->fragments.count > (1 << EIGHT_PARTITION) + 1) { + if (ctx->fragments.count >= MAX_PARTITIONS) { ctx->fragments.count = 0; *res = VPX_CODEC_INVALID_PARAM; return -1; } + ctx->fragments.ptrs[ctx->fragments.count] = data; + ctx->fragments.sizes[ctx->fragments.count] = data_sz; + ctx->fragments.count++; return 0; } @@ -301,17 +305,26 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, } if (!ctx->decoder_init && !ctx->si.is_kf) res = VPX_CODEC_UNSUP_BITSTREAM; + if (!res && ctx->decoder_init && w == 0 && h == 0 && ctx->si.h == 0 && + ctx->si.w == 0) { + VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + assert(pbi != NULL); + assert(!pbi->common.error.setjmp); + res = VPX_CODEC_CORRUPT_FRAME; + vpx_internal_error(&pbi->common.error, res, + "Keyframe / intra-only frame required to reset decoder" + " state"); + } if ((ctx->si.h != h) || (ctx->si.w != w)) resolution_change = 1; #if CONFIG_MULTITHREAD if (!res && ctx->restart_threads) { - struct frame_buffers *fb = &ctx->yv12_frame_buffers; VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; VP8_COMMON *const pc = &pbi->common; if (setjmp(pbi->common.error.jmp)) { - vp8_remove_decoder_instances(fb); - vp8_zero(fb->pbi); + pbi->common.error.setjmp = 0; + vp8_decoder_remove_threads(pbi); vpx_clear_system_state(); return VPX_CODEC_ERROR; } @@ -349,7 +362,14 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, } res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf); - if (res == VPX_CODEC_OK) ctx->decoder_init = 1; + if (res == VPX_CODEC_OK) { + ctx->decoder_init = 1; + } else { + /* on failure clear the cached resolution to ensure a full + * reallocation is attempted on resync. */ + ctx->si.w = 0; + ctx->si.h = 0; + } } /* Set these even if already initialized. 
The caller may have changed the @@ -494,6 +514,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, /* get ready for the next series of fragments */ ctx->fragments.count = 0; + pbi->common.error.setjmp = 0; } return res; diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index f3f42529d..dd3c8e623 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -10,7 +10,9 @@ #include <math.h> #include <new> +#include "vp8/common/common.h" #include "vp8/vp8_ratectrl_rtc.h" +#include "vp8/encoder/onyx_int.h" #include "vp8/encoder/ratectrl.h" #include "vpx_ports/system_state.h" @@ -60,12 +62,19 @@ std::unique_ptr<VP8RateControlRTC> VP8RateControlRTC::Create( if (!rc_api->cpi_) return nullptr; vp8_zero(*rc_api->cpi_); - rc_api->InitRateControl(cfg); + if (!rc_api->InitRateControl(cfg)) return nullptr; return rc_api; } -void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) { +VP8RateControlRTC::~VP8RateControlRTC() { + if (cpi_) { + vpx_free(cpi_->gf_active_flags); + vpx_free(cpi_); + } +} + +bool VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) { VP8_COMMON *cm = &cpi_->common; VP8_CONFIG *oxcf = &cpi_->oxcf; oxcf->end_usage = USAGE_STREAM_FROM_SERVER; @@ -83,13 +92,19 @@ void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) { cpi_->kf_bitrate_adjustment = 0; cpi_->gf_overspend_bits = 0; cpi_->non_gf_bitrate_adjustment = 0; - UpdateRateControl(rc_cfg); + if (!UpdateRateControl(rc_cfg)) return false; cpi_->buffer_level = oxcf->starting_buffer_level; cpi_->bits_off_target = oxcf->starting_buffer_level; + return true; } -void VP8RateControlRTC::UpdateRateControl( +bool VP8RateControlRTC::UpdateRateControl( const VP8RateControlRtcConfig &rc_cfg) { + if (rc_cfg.ts_number_layers < 1 || + rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS) { + return false; + } + VP8_COMMON *cm = &cpi_->common; VP8_CONFIG *oxcf = &cpi_->oxcf; const unsigned int prev_number_of_layers = oxcf->number_of_layers; @@ -118,6 +133,8 @@ void VP8RateControlRTC::UpdateRateControl( cpi_->buffered_mode = oxcf->optimal_buffer_level > 0; oxcf->under_shoot_pct = rc_cfg.undershoot_pct; oxcf->over_shoot_pct = rc_cfg.overshoot_pct; + oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh; + if (oxcf->drop_frames_water_mark > 0) cpi_->drop_frames_allowed = 1; cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct; cpi_->framerate = rc_cfg.framerate; for (int i = 0; i < KEY_FRAME_CONTEXT; ++i) { @@ -190,9 +207,11 @@ void VP8RateControlRTC::UpdateRateControl( vp8_new_framerate(cpi_, cpi_->framerate); vpx_clear_system_state(); + return true; } -void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { +FrameDropDecision VP8RateControlRTC::ComputeQP( + const VP8FrameParamsQpRTC &frame_params) { VP8_COMMON *const cm = &cpi_->common; vpx_clear_system_state(); if (cpi_->oxcf.number_of_layers > 1) { @@ -203,14 +222,27 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { vp8_restore_layer_context(cpi_, layer); vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate); } - cm->frame_type = frame_params.frame_type; + cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type); cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 
1 : 0; if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) { cpi_->common.frame_flags |= FRAMEFLAGS_KEY; } - vp8_pick_frame_size(cpi_); + cpi_->per_frame_bandwidth = static_cast<int>( + round(cpi_->oxcf.target_bandwidth / cpi_->output_framerate)); + if (vp8_check_drop_buffer(cpi_)) { + if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + return FrameDropDecision::kDrop; + } + + if (!vp8_pick_frame_size(cpi_)) { + cm->current_video_frame++; + cpi_->frames_since_key++; + cpi_->ext_refresh_frame_flags_pending = 0; + if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + return FrameDropDecision::kDrop; + } if (cpi_->buffer_level >= cpi_->oxcf.optimal_buffer_level && cpi_->buffered_mode) { @@ -274,10 +306,39 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { q_ = vp8_regulate_q(cpi_, cpi_->this_frame_target); vp8_set_quantizer(cpi_, q_); vpx_clear_system_state(); + return FrameDropDecision::kOk; } int VP8RateControlRTC::GetQP() const { return q_; } +int VP8RateControlRTC::GetLoopfilterLevel() const { + VP8_COMMON *cm = &cpi_->common; + const double qp = q_; + + // This model is from linear regression + if (cm->Width * cm->Height <= 320 * 240) { + cm->filter_level = static_cast<int>(0.352685 * qp + 2.957774); + } else if (cm->Width * cm->Height <= 640 * 480) { + cm->filter_level = static_cast<int>(0.485069 * qp - 0.534462); + } else { + cm->filter_level = static_cast<int>(0.314875 * qp + 7.959003); + } + + int min_filter_level = 0; + // This logic is from get_min_filter_level() in picklpf.c + if (q_ > 6 && q_ <= 16) { + min_filter_level = 1; + } else { + min_filter_level = (q_ / 8); + } + + const int max_filter_level = 63; + if (cm->filter_level < min_filter_level) cm->filter_level = min_filter_level; + if (cm->filter_level > max_filter_level) cm->filter_level = max_filter_level; + + return cm->filter_level; +} + void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { VP8_COMMON *const cm = &cpi_->common; vpx_clear_system_state(); diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h index def7dd8f9..59fb60752 100644 --- a/vp8/vp8_ratectrl_rtc.h +++ b/vp8/vp8_ratectrl_rtc.h @@ -12,23 +12,24 @@ #define VPX_VP8_RATECTRL_RTC_H_ #include <cstdint> +#include <cstring> #include <memory> -#include "vp8/encoder/onyx_int.h" -#include "vp8/common/common.h" #include "vpx/internal/vpx_ratectrl_rtc.h" +struct VP8_COMP; + namespace libvpx { struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig { public: VP8RateControlRtcConfig() { - vp8_zero(layer_target_bitrate); - vp8_zero(ts_rate_decimator); + memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate)); + memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator)); } }; struct VP8FrameParamsQpRTC { - FRAME_TYPE frame_type; + RcFrameType frame_type; int temporal_layer_id; }; @@ -36,25 +37,25 @@ class VP8RateControlRTC { public: static std::unique_ptr<VP8RateControlRTC> Create( const VP8RateControlRtcConfig &cfg); - ~VP8RateControlRTC() { - if (cpi_) { - vpx_free(cpi_->gf_active_flags); - vpx_free(cpi_); - } - } + ~VP8RateControlRTC(); - void UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); + bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; - // int GetLoopfilterLevel() const; - void ComputeQP(const VP8FrameParamsQpRTC &frame_params); + // GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter + // level is calculated 
from frame qp. + int GetLoopfilterLevel() const; + // ComputeQP computes the QP if the frame is not dropped (kOk return), + // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate + // are not to be called. + FrameDropDecision ComputeQP(const VP8FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame void PostEncodeUpdate(uint64_t encoded_frame_size); private: VP8RateControlRTC() {} - void InitRateControl(const VP8RateControlRtcConfig &cfg); - VP8_COMP *cpi_; + bool InitRateControl(const VP8RateControlRtcConfig &cfg); + struct VP8_COMP *cpi_; int q_; }; diff --git a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c index 219ff63cb..b43d7fa4f 100644 --- a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c +++ b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c @@ -18,7 +18,7 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/inv_txfm.h" -// Use macros to make sure argument lane is passed in as an constant integer. +// Use macros to make sure argument lane is passed in as a constant integer. #define vmull_lane_s32_dual(in, c, lane, out) \ do { \ @@ -64,9 +64,9 @@ highbd_dct_const_round_shift_low_8(const int64x2x2_t *const in) { #define highbd_iadst_half_butterfly(in, c, lane, out) \ do { \ - int64x2x2_t t[2]; \ - vmull_lane_s32_dual(in, c, lane, t); \ - out = highbd_dct_const_round_shift_low_8(t); \ + int64x2x2_t _t[2]; \ + vmull_lane_s32_dual(in, c, lane, _t); \ + out = highbd_dct_const_round_shift_low_8(_t); \ } while (0) #define highbd_iadst_butterfly(in0, in1, c, lane0, lane1, s0, s1) \ diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index d7de46cf4..aa13d8a0d 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -54,7 +54,7 @@ typedef struct { // decoder implementation modules critically rely on the defined entry values // specified herein. They should be refactored concurrently. -#define NONE (-1) +#define NO_REF_FRAME (-1) #define INTRA_FRAME 0 #define LAST_FRAME 1 #define GOLDEN_FRAME 2 diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index 8d2bed38e..d63bad93d 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -46,27 +46,6 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { return num_values > 0 ? 
get_msb(num_values) + 1 : 0; } -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(cm, lval, expr) \ - do { \ - assert(&(cm)->error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval " at %s:%d", __FILE__, \ - __LINE__); \ - } while (0) -#else -#define CHECK_MEM_ERROR(cm, lval, expr) \ - do { \ - assert(&(cm)->error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval); \ - } while (0) -#endif - #define VP9_SYNC_CODE_0 0x49 #define VP9_SYNC_CODE_1 0x83 #define VP9_SYNC_CODE_2 0x42 diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index bda824de3..9289fc9e1 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -381,7 +381,6 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { } if (cm->tx_mode == TX_MODE_SELECT) { - int j; unsigned int branch_ct_8x8p[TX_SIZES - 3][2]; unsigned int branch_ct_16x16p[TX_SIZES - 2][2]; unsigned int branch_ct_32x32p[TX_SIZES - 1][2]; diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 69069042c..71be0f310 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -150,6 +150,7 @@ void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, int eob) { + assert(((intptr_t)input) % 32 == 0); /* The calculation can be simplified if there are not many non-zero dct * coefficients. Use eobs to separate different cases. */ if (eob == 1) /* DC only DCT coefficient. */ @@ -164,6 +165,7 @@ void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, int eob) { + assert(((intptr_t)input) % 32 == 0); if (eob == 1) vpx_idct32x32_1_add(input, dest, stride); else if (eob <= 34) diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 765cb1172..1a9d45ae7 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -932,32 +932,32 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, break; default: for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) { - const int shift_y = shift_32_y[idx_32]; - const int shift_uv = shift_32_uv[idx_32]; + const int shift_y_32 = shift_32_y[idx_32]; + const int shift_uv_32 = shift_32_uv[idx_32]; const int mi_32_col_offset = ((idx_32 & 1) << 2); const int mi_32_row_offset = ((idx_32 >> 1) << 2); if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows) continue; switch (mip[0]->sb_type) { case BLOCK_32X32: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); break; case BLOCK_32X16: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); if (mi_32_row_offset + 2 >= max_rows) continue; mip2 = mip + mode_info_stride * 2; - build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm); + build_masks(lfi_n, mip2[0], shift_y_32 + 16, shift_uv_32 + 4, lfm); break; case BLOCK_16X32: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); if (mi_32_col_offset + 2 >= max_cols) continue; mip2 = mip + 2; - build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm); + build_masks(lfi_n, mip2[0], shift_y_32 + 2, shift_uv_32 + 1, lfm); break; default: for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], 
++idx_16) { - const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16]; - const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16]; + const int shift_y_16 = shift_y_32 + shift_16_y[idx_16]; + const int shift_uv_16 = shift_uv_32 + shift_16_uv[idx_16]; const int mi_16_col_offset = mi_32_col_offset + ((idx_16 & 1) << 1); const int mi_16_row_offset = @@ -968,28 +968,26 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, switch (mip[0]->sb_type) { case BLOCK_16X16: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); break; case BLOCK_16X8: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); if (mi_16_row_offset + 1 >= max_rows) continue; mip2 = mip + mode_info_stride; - build_y_mask(lfi_n, mip2[0], shift_y + 8, lfm); + build_y_mask(lfi_n, mip2[0], shift_y_16 + 8, lfm); break; case BLOCK_8X16: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); if (mi_16_col_offset + 1 >= max_cols) continue; mip2 = mip + 1; - build_y_mask(lfi_n, mip2[0], shift_y + 1, lfm); + build_y_mask(lfi_n, mip2[0], shift_y_16 + 1, lfm); break; default: { - const int shift_y = - shift_32_y[idx_32] + shift_16_y[idx_16] + shift_8_y[0]; - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + const int shift_y_8_0 = shift_y_16 + shift_8_y[0]; + build_masks(lfi_n, mip[0], shift_y_8_0, shift_uv_16, lfm); mip += offset[0]; for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) { - const int shift_y = shift_32_y[idx_32] + - shift_16_y[idx_16] + shift_8_y[idx_8]; + const int shift_y_8 = shift_y_16 + shift_8_y[idx_8]; const int mi_8_col_offset = mi_16_col_offset + ((idx_8 & 1)); const int mi_8_row_offset = @@ -998,7 +996,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, if (mi_8_col_offset >= max_cols || mi_8_row_offset >= max_rows) continue; - build_y_mask(lfi_n, mip[0], shift_y, lfm); + build_y_mask(lfi_n, mip[0], shift_y_8, lfm); } break; } diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c index e76d771b8..cf60fa40f 100644 --- a/vp9/common/vp9_mfqe.c +++ b/vp9/common/vp9_mfqe.c @@ -217,6 +217,7 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, const int bsl = b_width_log2_lookup[bs]; PARTITION_TYPE partition = partition_lookup[bsl][cur_bs]; const BLOCK_SIZE subsize = get_subsize(bs, partition); + BLOCK_SIZE mfqe_bs, bs_tmp; if (cur_bs < BLOCK_8X8) { // If there are blocks smaller than 8x8, it must be on the boundary. @@ -236,7 +237,6 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, uv_offset = 8; } switch (partition) { - BLOCK_SIZE mfqe_bs, bs_tmp; case PARTITION_HORZ: if (bs == BLOCK_64X64) { mfqe_bs = BLOCK_64X32; diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index ff59ff504..4878dc15e 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -158,18 +158,19 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, // Co-ordinate of containing block to pixel precision. 
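
The build_inter_predictors() hunk below replaces a per-plane if/else chain with indexed lookups. A minimal sketch of that table-driven form, with YV12Buf as a simplified stand-in for YV12_BUFFER_CONFIG:

#include <stdint.h>

typedef struct {
  uint8_t *y_buffer, *u_buffer, *v_buffer;
  int y_stride, uv_stride;
} YV12Buf; /* simplified stand-in */

/* Plane 0 is Y; planes 1 and 2 are U and V, which share a stride. */
static void get_plane_buf(const YV12Buf *ref, int plane, uint8_t **buf,
                          int *stride) {
  uint8_t *const bufs[3] = { ref->y_buffer, ref->u_buffer, ref->v_buffer };
  const int strides[3] = { ref->y_stride, ref->uv_stride, ref->uv_stride };
  *buf = bufs[plane];
  *stride = strides[plane];
}

The hunk also sets pre_buf->stride from the same table, so the pointer and the stride always come from the same plane.
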
const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); + const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf; + uint8_t *buf_array[] = { ref_buf->y_buffer, ref_buf->u_buffer, + ref_buf->v_buffer }; + const int stride_array[] = { ref_buf->y_stride, ref_buf->uv_stride, + ref_buf->uv_stride }; #if 0 // CONFIG_BETTER_HW_COMPATIBILITY assert(xd->mi[0]->sb_type != BLOCK_4X8 && xd->mi[0]->sb_type != BLOCK_8X4); assert(mv_q4.row == mv.row * (1 << (1 - pd->subsampling_y)) && mv_q4.col == mv.col * (1 << (1 - pd->subsampling_x))); #endif - if (plane == 0) - pre_buf->buf = xd->block_refs[ref]->buf->y_buffer; - else if (plane == 1) - pre_buf->buf = xd->block_refs[ref]->buf->u_buffer; - else - pre_buf->buf = xd->block_refs[ref]->buf->v_buffer; + pre_buf->buf = buf_array[plane]; + pre_buf->stride = stride_array[plane]; pre_buf->buf += scaled_buffer_offset(x_start + x, y_start + y, pre_buf->stride, sf); diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index f4bd9772c..3ecbd5417 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -23,7 +23,9 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -127,24 +129,21 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") { add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; +specialize qw/vp9_block_error_fp neon avx2 sse2/; -add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/; -add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - specialize qw/vp9_block_error avx2 sse2/; - - specialize qw/vp9_block_error_fp avx2 sse2/; + specialize qw/vp9_block_error neon avx2 sse2/; add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; - specialize qw/vp9_highbd_block_error sse2/; + specialize qw/vp9_highbd_block_error neon sse2/; } else { - specialize qw/vp9_block_error avx2 msa sse2/; - - specialize qw/vp9_block_error_fp neon avx2 sse2/; + 
specialize qw/vp9_block_error neon avx2 msa sse2/; } # fdct functions @@ -174,19 +173,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # # Motion search # -add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; -specialize qw/vp9_diamond_search_sad avx neon/; +add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv"; +specialize qw/vp9_diamond_search_sad neon/; # # Apply temporal filter # if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count"; -specialize qw/vp9_apply_temporal_filter sse4_1/; +specialize qw/vp9_apply_temporal_filter sse4_1 neon/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_apply_temporal_filter/, "const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count"; - specialize qw/vp9_highbd_apply_temporal_filter sse4_1/; + specialize qw/vp9_highbd_apply_temporal_filter sse4_1 neon/; } } @@ -195,10 +194,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # ENCODEMB INVOKE - add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_highbd_quantize_fp avx2 neon/; - add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ; + add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/; # fdct 
functions @@ -206,8 +205,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_fht4x4 neon/; add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_highbd_fht8x8 neon/; add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_highbd_fht16x16 neon/; add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; diff --git a/vp9/common/vp9_scan.c b/vp9/common/vp9_scan.c index 8bea61dea..adacb7ef9 100644 --- a/vp9/common/vp9_scan.c +++ b/vp9/common/vp9_scan.c @@ -688,14 +688,14 @@ DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = { 968, 974, 989, 997, 1003, 1007, 1015, 1019, 1022, 1024, }; -const scan_order vp9_default_scan_orders[TX_SIZES] = { +const ScanOrder vp9_default_scan_orders[TX_SIZES] = { { default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors }, { default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors }, { default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors }, { default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors }, }; -const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES] = { +const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES] = { { // TX_4X4 { default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors }, { row_scan_4x4, vp9_row_iscan_4x4, row_scan_4x4_neighbors }, diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h index 72a9a5ec4..3d1dcc66d 100644 --- a/vp9/common/vp9_scan.h +++ b/vp9/common/vp9_scan.h @@ -23,14 +23,14 @@ extern "C" { #define MAX_NEIGHBORS 2 -typedef struct { +typedef struct ScanOrder { const int16_t *scan; const int16_t *iscan; const int16_t *neighbors; -} scan_order; +} ScanOrder; -extern const scan_order vp9_default_scan_orders[TX_SIZES]; -extern const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES]; +extern const ScanOrder vp9_default_scan_orders[TX_SIZES]; +extern const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES]; static INLINE int get_coef_context(const int16_t *neighbors, const uint8_t *token_cache, int c) { @@ -39,8 +39,8 @@ static INLINE int get_coef_context(const int16_t *neighbors, 1; } -static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, - PLANE_TYPE type, int block_idx) { +static INLINE const ScanOrder *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx) { const MODE_INFO *const mi = xd->mi[0]; if (is_inter_block(mi) || type != PLANE_TYPE_Y || xd->lossless) { diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c index b3d50162b..8df18af3b 100644 --- a/vp9/common/vp9_thread_common.c +++ b/vp9/common/vp9_thread_common.c @@ -283,7 +283,7 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, { int i; - CHECK_MEM_ERROR(cm, lf_sync->mutex, + CHECK_MEM_ERROR(&cm->error, lf_sync->mutex, vpx_malloc(sizeof(*lf_sync->mutex) * rows)); if (lf_sync->mutex) { for (i = 0; i < rows; ++i) { @@ -291,7 +291,7 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, } } - CHECK_MEM_ERROR(cm, lf_sync->cond, + CHECK_MEM_ERROR(&cm->error, lf_sync->cond, vpx_malloc(sizeof(*lf_sync->cond) * rows)); if (lf_sync->cond) { for (i = 0; i < rows; ++i) { @@ -299,23 +299,21 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, } } - CHECK_MEM_ERROR(cm, lf_sync->lf_mutex, + CHECK_MEM_ERROR(&cm->error, lf_sync->lf_mutex, 
vpx_malloc(sizeof(*lf_sync->lf_mutex))); pthread_mutex_init(lf_sync->lf_mutex, NULL); - CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex, + CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_mutex, vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows)); if (lf_sync->recon_done_mutex) { - int i; for (i = 0; i < rows; ++i) { pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL); } } - CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond, + CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_cond, vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows)); if (lf_sync->recon_done_cond) { - int i; for (i = 0; i < rows; ++i) { pthread_cond_init(&lf_sync->recon_done_cond[i], NULL); } @@ -323,15 +321,15 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, } #endif // CONFIG_MULTITHREAD - CHECK_MEM_ERROR(cm, lf_sync->lfdata, + CHECK_MEM_ERROR(&cm->error, lf_sync->lfdata, vpx_malloc(num_workers * sizeof(*lf_sync->lfdata))); lf_sync->num_workers = num_workers; lf_sync->num_active_workers = lf_sync->num_workers; - CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, + CHECK_MEM_ERROR(&cm->error, lf_sync->cur_sb_col, vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); - CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done, + CHECK_MEM_ERROR(&cm->error, lf_sync->num_tiles_done, vpx_malloc(sizeof(*lf_sync->num_tiles_done) * mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2)); @@ -390,10 +388,10 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { int return_val = -1; - int cur_row; const int max_rows = cm->mi_rows; #if CONFIG_MULTITHREAD + int cur_row; const int tile_cols = 1 << cm->log2_tile_cols; pthread_mutex_lock(lf_sync->lf_mutex); @@ -430,14 +428,8 @@ static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { #else (void)lf_sync; if (cm->lf_row < max_rows) { - cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; return_val = cm->lf_row; cm->lf_row += MI_BLOCK_SIZE; - if (cm->lf_row < max_rows) { - /* If this is not the last row, make sure the next row is also decoded. - * This is because the intra predict has to happen before loop filter */ - cur_row += 1; - } } #endif // CONFIG_MULTITHREAD diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 2a27e6fdb..c5892156f 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -323,9 +323,9 @@ static void predict_and_reconstruct_intra_block(TileWorkerData *twd, if (!mi->skip) { const TX_TYPE tx_type = (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; - const scan_order *sc = (plane || xd->lossless) - ? &vp9_default_scan_orders[tx_size] - : &vp9_scan_orders[tx_size][tx_type]; + const ScanOrder *sc = (plane || xd->lossless) + ? &vp9_default_scan_orders[tx_size] + : &vp9_scan_orders[tx_size][tx_type]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); if (eob > 0) { @@ -348,9 +348,9 @@ static void parse_intra_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, struct macroblockd_plane *const pd = &xd->plane[plane]; const TX_TYPE tx_type = (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; - const scan_order *sc = (plane || xd->lossless) - ? &vp9_default_scan_orders[tx_size] - : &vp9_scan_orders[tx_size][tx_type]; + const ScanOrder *sc = (plane || xd->lossless) + ? 
&vp9_default_scan_orders[tx_size] + : &vp9_scan_orders[tx_size][tx_type]; *pd->eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); /* Keep the alignment to 16 */ @@ -393,7 +393,7 @@ static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi, int mi_row, int mi_col) { MACROBLOCKD *const xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *sc = &vp9_default_scan_orders[tx_size]; + const ScanOrder *sc = &vp9_default_scan_orders[tx_size]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; @@ -423,7 +423,7 @@ static int parse_inter_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, TX_SIZE tx_size) { MACROBLOCKD *const xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *sc = &vp9_default_scan_orders[tx_size]; + const ScanOrder *sc = &vp9_default_scan_orders[tx_size]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); @@ -1469,7 +1469,7 @@ static void resize_mv_buffer(VP9_COMMON *cm) { vpx_free(cm->cur_frame->mvs); cm->cur_frame->mi_rows = cm->mi_rows; cm->cur_frame->mi_cols = cm->mi_cols; - CHECK_MEM_ERROR(cm, cm->cur_frame->mvs, + CHECK_MEM_ERROR(&cm->error, cm->cur_frame->mvs, (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cm->cur_frame->mvs))); } @@ -1776,7 +1776,8 @@ static void vp9_jobq_alloc(VP9Decoder *pbi) { if (jobq_size > row_mt_worker_data->jobq_size) { vpx_free(row_mt_worker_data->jobq_buf); - CHECK_MEM_ERROR(cm, row_mt_worker_data->jobq_buf, vpx_calloc(1, jobq_size)); + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->jobq_buf, + vpx_calloc(1, jobq_size)); vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf, jobq_size); row_mt_worker_data->jobq_size = jobq_size; @@ -1923,7 +1924,7 @@ static int row_decode_worker_hook(void *arg1, void *arg2) { const int is_last_row = sb_rows - 1 == cur_sb_row; int mi_col_start, mi_col_end; if (!tile_data_recon) - CHECK_MEM_ERROR(cm, tile_data_recon, + CHECK_MEM_ERROR(&cm->error, tile_data_recon, vpx_memalign(32, sizeof(TileWorkerData))); tile_data_recon->xd = pbi->mb; @@ -2025,7 +2026,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, if (cm->lf.filter_level && !cm->skip_loop_filter && pbi->lf_worker.data1 == NULL) { - CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, + CHECK_MEM_ERROR(&cm->error, pbi->lf_worker.data1, vpx_memalign(32, sizeof(LFWorkerData))); pbi->lf_worker.hook = vp9_loop_filter_worker; if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) { @@ -2192,8 +2193,6 @@ static int tile_worker_hook(void *arg1, void *arg2) { volatile int mi_row = 0; volatile int n = tile_data->buf_start; - tile_data->error_info.setjmp = 1; - if (setjmp(tile_data->error_info.jmp)) { tile_data->error_info.setjmp = 0; tile_data->xd.corrupted = 1; @@ -2206,6 +2205,7 @@ static int tile_worker_hook(void *arg1, void *arg2) { } return 0; } + tile_data->error_info.setjmp = 1; tile_data->xd.corrupted = 0; @@ -2285,7 +2285,7 @@ static INLINE void init_mt(VP9Decoder *pbi) { if (pbi->num_tile_workers == 0) { const int num_threads = pbi->max_threads; - CHECK_MEM_ERROR(cm, pbi->tile_workers, + CHECK_MEM_ERROR(&cm->error, pbi->tile_workers, vpx_malloc(num_threads * sizeof(*pbi->tile_workers))); for (n = 0; n < num_threads; ++n) { VPxWorker *const worker = &pbi->tile_workers[n]; @@ -2293,6 +2293,11 @@ static INLINE void init_mt(VP9Decoder *pbi) { 
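
Immediately below, init_mt() gains cleanup for partially created worker pools: if reset() fails for worker n, workers 0..n-1 are ended and the array is freed before the error is raised. A minimal sketch of the unwind pattern, with hypothetical Worker helpers standing in for the VPxWorker interface:

typedef struct { int running; } Worker; /* hypothetical stand-in */

static int worker_reset(Worker *w) { w->running = 1; return 1; }
static void worker_end(Worker *w) { w->running = 0; }

/* Start n workers; on failure, unwind the ones already started so no
 * threads are leaked before the error path runs. */
static int start_workers(Worker *workers, int n) {
  int i;
  for (i = 0; i < n; ++i) {
    if (!worker_reset(&workers[i])) {
      while (i-- > 0) worker_end(&workers[i]);
      return 0; /* caller reports VPX_CODEC_ERROR */
    }
  }
  return 1;
}
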
winterface->init(worker); if (n < num_threads - 1 && !winterface->reset(worker)) { + do { + winterface->end(&pbi->tile_workers[pbi->num_tile_workers - 1]); + } while (--pbi->num_tile_workers != 0); + vpx_free(pbi->tile_workers); + pbi->tile_workers = NULL; vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Tile decoder thread creation failed"); } @@ -2824,7 +2829,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, const int num_jobs = sb_rows << cm->log2_tile_cols; if (pbi->row_mt_worker_data == NULL) { - CHECK_MEM_ERROR(cm, pbi->row_mt_worker_data, + CHECK_MEM_ERROR(&cm->error, pbi->row_mt_worker_data, vpx_calloc(1, sizeof(*pbi->row_mt_worker_data))); #if CONFIG_MULTITHREAD pthread_mutex_init(&pbi->row_mt_worker_data->recon_done_mutex, NULL); @@ -3006,7 +3011,8 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, // platforms without DECLARE_ALIGNED(). assert((sizeof(*pbi->tile_worker_data) % 16) == 0); vpx_free(pbi->tile_worker_data); - CHECK_MEM_ERROR(cm, pbi->tile_worker_data, vpx_memalign(32, twd_size)); + CHECK_MEM_ERROR(&cm->error, pbi->tile_worker_data, + vpx_memalign(32, twd_size)); pbi->total_tiles = tile_rows * tile_cols; } diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index db3e74663..0989cde58 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -204,7 +204,7 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm, mi->skip = read_skip(cm, xd, mi->segment_id, r); mi->tx_size = read_tx_size(cm, xd, 1, r); mi->ref_frame[0] = INTRA_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; switch (bsize) { case BLOCK_4X4: @@ -299,7 +299,7 @@ static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm, } } -// Read the referncence frame +// Read the reference frame static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, vpx_reader *r, int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { @@ -309,7 +309,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); - ref_frame[1] = NONE; + ref_frame[1] = NO_REF_FRAME; } else { const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding @@ -333,7 +333,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, ref_frame[0] = LAST_FRAME; } - ref_frame[1] = NONE; + ref_frame[1] = NO_REF_FRAME; } else { assert(0 && "Invalid prediction mode."); } @@ -383,7 +383,7 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, mi->interp_filter = SWITCHABLE_FILTERS; mi->ref_frame[0] = INTRA_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; } static INLINE int is_mv_valid(const MV *mv) { @@ -708,7 +708,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, mi->mode = ZEROMV; if (bsize < BLOCK_8X8) { vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM, - "Invalid usage of segement feature on small blocks"); + "Invalid usage of segment feature on small blocks"); return; } } else { diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 7db8ed72d..5a7e9f9ab 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -66,7 +66,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, { int i; CHECK_MEM_ERROR( - cm, row_mt_worker_data->recon_sync_mutex, + &cm->error, 
row_mt_worker_data->recon_sync_mutex, vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_mutex) * num_jobs)); if (row_mt_worker_data->recon_sync_mutex) { for (i = 0; i < num_jobs; ++i) { @@ -75,7 +75,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, } CHECK_MEM_ERROR( - cm, row_mt_worker_data->recon_sync_cond, + &cm->error, row_mt_worker_data->recon_sync_cond, vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_cond) * num_jobs)); if (row_mt_worker_data->recon_sync_cond) { for (i = 0; i < num_jobs; ++i) { @@ -86,24 +86,24 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, #endif row_mt_worker_data->num_sbs = num_sbs; for (plane = 0; plane < 3; ++plane) { - CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane], - vpx_memalign(16, dqcoeff_size)); + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->dqcoeff[plane], + vpx_memalign(32, dqcoeff_size)); memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size); - CHECK_MEM_ERROR(cm, row_mt_worker_data->eob[plane], + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->eob[plane], vpx_calloc(num_sbs << EOBS_PER_SB_LOG2, sizeof(*row_mt_worker_data->eob[plane]))); } - CHECK_MEM_ERROR(cm, row_mt_worker_data->partition, + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->partition, vpx_calloc(num_sbs * PARTITIONS_PER_SB, sizeof(*row_mt_worker_data->partition))); - CHECK_MEM_ERROR(cm, row_mt_worker_data->recon_map, + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->recon_map, vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map))); // allocate memory for thread_data if (row_mt_worker_data->thread_data == NULL) { const size_t thread_size = max_threads * sizeof(*row_mt_worker_data->thread_data); - CHECK_MEM_ERROR(cm, row_mt_worker_data->thread_data, + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->thread_data, vpx_memalign(32, thread_size)); } } @@ -181,9 +181,10 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) { cm->error.setjmp = 1; - CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); + CHECK_MEM_ERROR(&cm->error, cm->fc, + (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); CHECK_MEM_ERROR( - cm, cm->frame_contexts, + &cm->error, cm->frame_contexts, (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); pbi->need_resync = 1; diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index b0ef83c73..2e198d552 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -54,7 +54,7 @@ typedef struct TileWorkerData { VP9LfSync *lf_sync; DECLARE_ALIGNED(16, MACROBLOCKD, xd); /* dqcoeff are shared by all the planes. 
So planes must be decoded serially */ - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]); DECLARE_ALIGNED(16, uint16_t, extend_and_predict_buf[80 * 2 * 80 * 2]); struct vpx_internal_error_info error_info; } TileWorkerData; diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 3ed1bd6ff..d957dc34e 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -272,9 +272,8 @@ static void get_ctx_shift(MACROBLOCKD *xd, int *ctx_shift_a, int *ctx_shift_l, } } -int vp9_decode_block_tokens(TileWorkerData *twd, int plane, - const scan_order *sc, int x, int y, TX_SIZE tx_size, - int seg_id) { +int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc, + int x, int y, TX_SIZE tx_size, int seg_id) { vpx_reader *r = &twd->bit_reader; MACROBLOCKD *xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h index a32052fff..a8e47021b 100644 --- a/vp9/decoder/vp9_detokenize.h +++ b/vp9/decoder/vp9_detokenize.h @@ -19,9 +19,8 @@ extern "C" { #endif -int vp9_decode_block_tokens(TileWorkerData *twd, int plane, - const scan_order *sc, int x, int y, TX_SIZE tx_size, - int seg_id); +int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc, + int x, int y, TX_SIZE tx_size, int seg_id); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c index 5961be5f3..997b5477e 100644 --- a/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -20,6 +20,7 @@ #include "vpx_dsp/arm/fdct_neon.h" #include "vpx_dsp/arm/fdct4x4_neon.h" #include "vpx_dsp/arm/fdct8x8_neon.h" +#include "vpx_dsp/arm/fdct16x16_neon.h" static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in, int stride) { @@ -1228,4 +1229,945 @@ void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, } } +static INLINE void highbd_load_buffer_8x8(const int16_t *input, + int32x4_t *lo /*[8]*/, + int32x4_t *hi /*[8]*/, int stride) { + int16x8_t in[8]; + in[0] = vld1q_s16(input + 0 * stride); + in[1] = vld1q_s16(input + 1 * stride); + in[2] = vld1q_s16(input + 2 * stride); + in[3] = vld1q_s16(input + 3 * stride); + in[4] = vld1q_s16(input + 4 * stride); + in[5] = vld1q_s16(input + 5 * stride); + in[6] = vld1q_s16(input + 6 * stride); + in[7] = vld1q_s16(input + 7 * stride); + lo[0] = vshll_n_s16(vget_low_s16(in[0]), 2); + hi[0] = vshll_n_s16(vget_high_s16(in[0]), 2); + lo[1] = vshll_n_s16(vget_low_s16(in[1]), 2); + hi[1] = vshll_n_s16(vget_high_s16(in[1]), 2); + lo[2] = vshll_n_s16(vget_low_s16(in[2]), 2); + hi[2] = vshll_n_s16(vget_high_s16(in[2]), 2); + lo[3] = vshll_n_s16(vget_low_s16(in[3]), 2); + hi[3] = vshll_n_s16(vget_high_s16(in[3]), 2); + lo[4] = vshll_n_s16(vget_low_s16(in[4]), 2); + hi[4] = vshll_n_s16(vget_high_s16(in[4]), 2); + lo[5] = vshll_n_s16(vget_low_s16(in[5]), 2); + hi[5] = vshll_n_s16(vget_high_s16(in[5]), 2); + lo[6] = vshll_n_s16(vget_low_s16(in[6]), 2); + hi[6] = vshll_n_s16(vget_high_s16(in[6]), 2); + lo[7] = vshll_n_s16(vget_low_s16(in[7]), 2); + hi[7] = vshll_n_s16(vget_high_s16(in[7]), 2); +} + +/* right shift and rounding + * first get the sign bit (bit 15). + * If bit == 1, it's the simple case of shifting right by one bit. + * If bit == 2, it essentially computes the expression: + * + * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + * + * for each row. 
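
A scalar sketch of the rounding identity this comment describes: for int32_t, x - (x >> 31) equals x + (x < 0), assuming an arithmetic right shift, which the SIMD code also relies on (the sign bit is bit 31 here, not bit 15):

#include <stdint.h>

/* Round-to-nearest right shift by 1 or 2, matching
 * out = (x + 1 + (x < 0)) >> 2 for bit == 2. */
static int32_t round_shift(int32_t x, int bit) {
  const int32_t sign = x >> 31; /* 0 for x >= 0, -1 for x < 0 */
  if (bit == 2) x += 1;
  x -= sign;                    /* same as x += (x < 0) */
  return x >> bit;              /* bit is 1 or 2 */
}

Making the adjustment branch-free is what lets the vector version below express it with vshrq_n_s32 and vsubq_s32 across all eight lanes.
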
+ */ +static INLINE void highbd_right_shift_8x8(int32x4_t *lo, int32x4_t *hi, + const int bit) { + int32x4_t sign_lo[8], sign_hi[8]; + sign_lo[0] = vshrq_n_s32(lo[0], 31); + sign_hi[0] = vshrq_n_s32(hi[0], 31); + sign_lo[1] = vshrq_n_s32(lo[1], 31); + sign_hi[1] = vshrq_n_s32(hi[1], 31); + sign_lo[2] = vshrq_n_s32(lo[2], 31); + sign_hi[2] = vshrq_n_s32(hi[2], 31); + sign_lo[3] = vshrq_n_s32(lo[3], 31); + sign_hi[3] = vshrq_n_s32(hi[3], 31); + sign_lo[4] = vshrq_n_s32(lo[4], 31); + sign_hi[4] = vshrq_n_s32(hi[4], 31); + sign_lo[5] = vshrq_n_s32(lo[5], 31); + sign_hi[5] = vshrq_n_s32(hi[5], 31); + sign_lo[6] = vshrq_n_s32(lo[6], 31); + sign_hi[6] = vshrq_n_s32(hi[6], 31); + sign_lo[7] = vshrq_n_s32(lo[7], 31); + sign_hi[7] = vshrq_n_s32(hi[7], 31); + + if (bit == 2) { + const int32x4_t const_rounding = vdupq_n_s32(1); + lo[0] = vaddq_s32(lo[0], const_rounding); + hi[0] = vaddq_s32(hi[0], const_rounding); + lo[1] = vaddq_s32(lo[1], const_rounding); + hi[1] = vaddq_s32(hi[1], const_rounding); + lo[2] = vaddq_s32(lo[2], const_rounding); + hi[2] = vaddq_s32(hi[2], const_rounding); + lo[3] = vaddq_s32(lo[3], const_rounding); + hi[3] = vaddq_s32(hi[3], const_rounding); + lo[4] = vaddq_s32(lo[4], const_rounding); + hi[4] = vaddq_s32(hi[4], const_rounding); + lo[5] = vaddq_s32(lo[5], const_rounding); + hi[5] = vaddq_s32(hi[5], const_rounding); + lo[6] = vaddq_s32(lo[6], const_rounding); + hi[6] = vaddq_s32(hi[6], const_rounding); + lo[7] = vaddq_s32(lo[7], const_rounding); + hi[7] = vaddq_s32(hi[7], const_rounding); + } + + lo[0] = vsubq_s32(lo[0], sign_lo[0]); + hi[0] = vsubq_s32(hi[0], sign_hi[0]); + lo[1] = vsubq_s32(lo[1], sign_lo[1]); + hi[1] = vsubq_s32(hi[1], sign_hi[1]); + lo[2] = vsubq_s32(lo[2], sign_lo[2]); + hi[2] = vsubq_s32(hi[2], sign_hi[2]); + lo[3] = vsubq_s32(lo[3], sign_lo[3]); + hi[3] = vsubq_s32(hi[3], sign_hi[3]); + lo[4] = vsubq_s32(lo[4], sign_lo[4]); + hi[4] = vsubq_s32(hi[4], sign_hi[4]); + lo[5] = vsubq_s32(lo[5], sign_lo[5]); + hi[5] = vsubq_s32(hi[5], sign_hi[5]); + lo[6] = vsubq_s32(lo[6], sign_lo[6]); + hi[6] = vsubq_s32(hi[6], sign_hi[6]); + lo[7] = vsubq_s32(lo[7], sign_lo[7]); + hi[7] = vsubq_s32(hi[7], sign_hi[7]); + + if (bit == 1) { + lo[0] = vshrq_n_s32(lo[0], 1); + hi[0] = vshrq_n_s32(hi[0], 1); + lo[1] = vshrq_n_s32(lo[1], 1); + hi[1] = vshrq_n_s32(hi[1], 1); + lo[2] = vshrq_n_s32(lo[2], 1); + hi[2] = vshrq_n_s32(hi[2], 1); + lo[3] = vshrq_n_s32(lo[3], 1); + hi[3] = vshrq_n_s32(hi[3], 1); + lo[4] = vshrq_n_s32(lo[4], 1); + hi[4] = vshrq_n_s32(hi[4], 1); + lo[5] = vshrq_n_s32(lo[5], 1); + hi[5] = vshrq_n_s32(hi[5], 1); + lo[6] = vshrq_n_s32(lo[6], 1); + hi[6] = vshrq_n_s32(hi[6], 1); + lo[7] = vshrq_n_s32(lo[7], 1); + hi[7] = vshrq_n_s32(hi[7], 1); + } else { + lo[0] = vshrq_n_s32(lo[0], 2); + hi[0] = vshrq_n_s32(hi[0], 2); + lo[1] = vshrq_n_s32(lo[1], 2); + hi[1] = vshrq_n_s32(hi[1], 2); + lo[2] = vshrq_n_s32(lo[2], 2); + hi[2] = vshrq_n_s32(hi[2], 2); + lo[3] = vshrq_n_s32(lo[3], 2); + hi[3] = vshrq_n_s32(hi[3], 2); + lo[4] = vshrq_n_s32(lo[4], 2); + hi[4] = vshrq_n_s32(hi[4], 2); + lo[5] = vshrq_n_s32(lo[5], 2); + hi[5] = vshrq_n_s32(hi[5], 2); + lo[6] = vshrq_n_s32(lo[6], 2); + hi[6] = vshrq_n_s32(hi[6], 2); + lo[7] = vshrq_n_s32(lo[7], 2); + hi[7] = vshrq_n_s32(hi[7], 2); + } +} + +static INLINE void highbd_write_buffer_8x8(tran_low_t *output, int32x4_t *lo, + int32x4_t *hi, int stride) { + vst1q_s32(output + 0 * stride, lo[0]); + vst1q_s32(output + 0 * stride + 4, hi[0]); + vst1q_s32(output + 1 * stride, lo[1]); + vst1q_s32(output + 1 * stride + 4, 
hi[1]); + vst1q_s32(output + 2 * stride, lo[2]); + vst1q_s32(output + 2 * stride + 4, hi[2]); + vst1q_s32(output + 3 * stride, lo[3]); + vst1q_s32(output + 3 * stride + 4, hi[3]); + vst1q_s32(output + 4 * stride, lo[4]); + vst1q_s32(output + 4 * stride + 4, hi[4]); + vst1q_s32(output + 5 * stride, lo[5]); + vst1q_s32(output + 5 * stride + 4, hi[5]); + vst1q_s32(output + 6 * stride, lo[6]); + vst1q_s32(output + 6 * stride + 4, hi[6]); + vst1q_s32(output + 7 * stride, lo[7]); + vst1q_s32(output + 7 * stride + 4, hi[7]); +} + +static INLINE void highbd_fadst8x8_neon(int32x4_t *lo /*[8]*/, + int32x4_t *hi /*[8]*/) { + int32x4_t s_lo[8], s_hi[8]; + int32x4_t t_lo[8], t_hi[8]; + int32x4_t x_lo[8], x_hi[8]; + int64x2_t s64_lo[16], s64_hi[16]; + + x_lo[0] = lo[7]; + x_hi[0] = hi[7]; + x_lo[1] = lo[0]; + x_hi[1] = hi[0]; + x_lo[2] = lo[5]; + x_hi[2] = hi[5]; + x_lo[3] = lo[2]; + x_hi[3] = hi[2]; + x_lo[4] = lo[3]; + x_hi[4] = hi[3]; + x_lo[5] = lo[4]; + x_hi[5] = hi[4]; + x_lo[6] = lo[1]; + x_hi[6] = hi[1]; + x_lo[7] = lo[6]; + x_hi[7] = hi[6]; + + // stage 1 + // s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + // s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + butterfly_two_coeff_s32_s64_noround( + x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_2_64, cospi_30_64, + &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]); + // s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + // s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + butterfly_two_coeff_s32_s64_noround( + x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_10_64, cospi_22_64, + &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]); + + // s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + // s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + butterfly_two_coeff_s32_s64_noround( + x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_18_64, cospi_14_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + + // s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + // s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + butterfly_two_coeff_s32_s64_noround( + x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_26_64, cospi_6_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + + // fdct_round_shift, indices are doubled + t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]); + t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]); + t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]); + t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]); + t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]); + t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]); + t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]); + t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]); + t_lo[4] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]); + t_hi[4] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]); + t_lo[5] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]); + t_hi[5] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]); + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]); + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + // s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + butterfly_two_coeff_s32_s64_noround( + t_lo[4], 
t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + + // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + // s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + butterfly_two_coeff_s32_s64_noround( + t_lo[6], t_hi[6], t_lo[7], t_hi[7], -cospi_24_64, cospi_8_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + + // fdct_round_shift + // s0 + s2 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]); + // s0 - s2 + t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]); + + // s1 + s3 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]); + // s1 - s3 + t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]); + + // s4 + s6 + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s4 - s6 + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + + // s5 + s7 + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s5 - s7 + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + + // stage 3 + // s2 = cospi_16_64 * (x2 + x3) + // s3 = cospi_16_64 * (x2 - x3) + butterfly_one_coeff_s32_fast(t_lo[2], t_hi[2], t_lo[3], t_hi[3], cospi_16_64, + &s_lo[2], &s_hi[2], &s_lo[3], &s_hi[3]); + + // s6 = cospi_16_64 * (x6 + x7) + // s7 = cospi_16_64 * (x6 - x7) + butterfly_one_coeff_s32_fast(t_lo[6], t_hi[6], t_lo[7], t_hi[7], cospi_16_64, + &s_lo[6], &s_hi[6], &s_lo[7], &s_hi[7]); + + // x0, x2, x4, x6 pass through + lo[0] = t_lo[0]; + hi[0] = t_hi[0]; + lo[2] = s_lo[6]; + hi[2] = s_hi[6]; + lo[4] = s_lo[3]; + hi[4] = s_hi[3]; + lo[6] = t_lo[5]; + hi[6] = t_hi[5]; + + lo[1] = vnegq_s32(t_lo[4]); + hi[1] = vnegq_s32(t_hi[4]); + lo[3] = vnegq_s32(s_lo[2]); + hi[3] = vnegq_s32(s_hi[2]); + lo[5] = vnegq_s32(s_lo[7]); + hi[5] = vnegq_s32(s_hi[7]); + lo[7] = vnegq_s32(t_lo[1]); + hi[7] = vnegq_s32(t_hi[1]); + + transpose_s32_8x8_2(lo, hi, lo, hi); +} + +void vp9_highbd_fht8x8_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t lo[8], hi[8]; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct8x8_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_8x8(input, lo, hi, stride); + highbd_fadst8x8_neon(lo, hi); + // pass1 variant is not precise enough + vpx_highbd_fdct8x8_pass2_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + case DCT_ADST: + highbd_load_buffer_8x8(input, lo, hi, stride); + // pass1 variant is not precise enough + vpx_highbd_fdct8x8_pass2_neon(lo, hi); + highbd_fadst8x8_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_8x8(input, lo, hi, stride); + highbd_fadst8x8_neon(lo, hi); + highbd_fadst8x8_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + } +} + +static INLINE void highbd_load_buffer_16x16( + const int16_t *input, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) { + // load first 8 
columns + highbd_load_buffer_8x8(input, left1, right1, stride); + highbd_load_buffer_8x8(input + 8 * stride, left1 + 8, right1 + 8, stride); + + input += 8; + // load second 8 columns + highbd_load_buffer_8x8(input, left2, right2, stride); + highbd_load_buffer_8x8(input + 8 * stride, left2 + 8, right2 + 8, stride); +} + +static INLINE void highbd_write_buffer_16x16( + tran_low_t *output, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) { + // write first 8 columns + highbd_write_buffer_8x8(output, left1, right1, stride); + highbd_write_buffer_8x8(output + 8 * stride, left1 + 8, right1 + 8, stride); + + // write second 8 columns + output += 8; + highbd_write_buffer_8x8(output, left2, right2, stride); + highbd_write_buffer_8x8(output + 8 * stride, left2 + 8, right2 + 8, stride); +} + +static INLINE void highbd_right_shift_16x16(int32x4_t *left1 /*[16]*/, + int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, + int32x4_t *right2 /*[16]*/, + const int bit) { + // perform rounding operations + highbd_right_shift_8x8(left1, right1, bit); + highbd_right_shift_8x8(left1 + 8, right1 + 8, bit); + highbd_right_shift_8x8(left2, right2, bit); + highbd_right_shift_8x8(left2 + 8, right2 + 8, bit); +} + +static void highbd_fdct16_8col(int32x4_t *left, int32x4_t *right) { + // perform 16x16 1-D DCT for 8 columns + int32x4_t s1_lo[8], s1_hi[8], s2_lo[8], s2_hi[8], s3_lo[8], s3_hi[8]; + int32x4_t left8[8], right8[8]; + + // stage 1 + left8[0] = vaddq_s32(left[0], left[15]); + right8[0] = vaddq_s32(right[0], right[15]); + left8[1] = vaddq_s32(left[1], left[14]); + right8[1] = vaddq_s32(right[1], right[14]); + left8[2] = vaddq_s32(left[2], left[13]); + right8[2] = vaddq_s32(right[2], right[13]); + left8[3] = vaddq_s32(left[3], left[12]); + right8[3] = vaddq_s32(right[3], right[12]); + left8[4] = vaddq_s32(left[4], left[11]); + right8[4] = vaddq_s32(right[4], right[11]); + left8[5] = vaddq_s32(left[5], left[10]); + right8[5] = vaddq_s32(right[5], right[10]); + left8[6] = vaddq_s32(left[6], left[9]); + right8[6] = vaddq_s32(right[6], right[9]); + left8[7] = vaddq_s32(left[7], left[8]); + right8[7] = vaddq_s32(right[7], right[8]); + + // step 1 + s1_lo[0] = vsubq_s32(left[7], left[8]); + s1_hi[0] = vsubq_s32(right[7], right[8]); + s1_lo[1] = vsubq_s32(left[6], left[9]); + s1_hi[1] = vsubq_s32(right[6], right[9]); + s1_lo[2] = vsubq_s32(left[5], left[10]); + s1_hi[2] = vsubq_s32(right[5], right[10]); + s1_lo[3] = vsubq_s32(left[4], left[11]); + s1_hi[3] = vsubq_s32(right[4], right[11]); + s1_lo[4] = vsubq_s32(left[3], left[12]); + s1_hi[4] = vsubq_s32(right[3], right[12]); + s1_lo[5] = vsubq_s32(left[2], left[13]); + s1_hi[5] = vsubq_s32(right[2], right[13]); + s1_lo[6] = vsubq_s32(left[1], left[14]); + s1_hi[6] = vsubq_s32(right[1], right[14]); + s1_lo[7] = vsubq_s32(left[0], left[15]); + s1_hi[7] = vsubq_s32(right[0], right[15]); + + // pass1 variant is not accurate enough + vpx_highbd_fdct8x8_pass2_notranspose_neon(left8, right8); + + // step 2 + // step2[2] = (step1[5] - step1[2]) * cospi_16_64; + // step2[5] = (step1[5] + step1[2]) * cospi_16_64; + butterfly_one_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2], + cospi_16_64, &s2_lo[5], &s2_hi[5], + &s2_lo[2], &s2_hi[2]); + // step2[3] = (step1[4] - step1[3]) * cospi_16_64; + // step2[4] = (step1[4] + step1[3]) * cospi_16_64; + butterfly_one_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3], + cospi_16_64, &s2_lo[4], &s2_hi[4], + &s2_lo[3], &s2_hi[3]); + + // 
step 3 + s3_lo[0] = vaddq_s32(s1_lo[0], s2_lo[3]); + s3_hi[0] = vaddq_s32(s1_hi[0], s2_hi[3]); + s3_lo[1] = vaddq_s32(s1_lo[1], s2_lo[2]); + s3_hi[1] = vaddq_s32(s1_hi[1], s2_hi[2]); + s3_lo[2] = vsubq_s32(s1_lo[1], s2_lo[2]); + s3_hi[2] = vsubq_s32(s1_hi[1], s2_hi[2]); + s3_lo[3] = vsubq_s32(s1_lo[0], s2_lo[3]); + s3_hi[3] = vsubq_s32(s1_hi[0], s2_hi[3]); + s3_lo[4] = vsubq_s32(s1_lo[7], s2_lo[4]); + s3_hi[4] = vsubq_s32(s1_hi[7], s2_hi[4]); + s3_lo[5] = vsubq_s32(s1_lo[6], s2_lo[5]); + s3_hi[5] = vsubq_s32(s1_hi[6], s2_hi[5]); + s3_lo[6] = vaddq_s32(s1_lo[6], s2_lo[5]); + s3_hi[6] = vaddq_s32(s1_hi[6], s2_hi[5]); + s3_lo[7] = vaddq_s32(s1_lo[7], s2_lo[4]); + s3_hi[7] = vaddq_s32(s1_hi[7], s2_hi[4]); + + // step 4 + // s2[1] = cospi_24_64 * s3[6] - cospi_8_64 * s3[1] + // s2[6] = cospi_8_64 * s3[6] + cospi_24_64 * s3[1] + butterfly_two_coeff_s32_s64_narrow(s3_lo[6], s3_hi[6], s3_lo[1], s3_hi[1], + cospi_8_64, cospi_24_64, &s2_lo[6], + &s2_hi[6], &s2_lo[1], &s2_hi[1]); + + // s2[5] = cospi_8_64 * s3[2] - cospi_24_64 * s3[5] + // s2[2] = cospi_24_64 * s3[2] + cospi_8_64 * s3[5] + butterfly_two_coeff_s32_s64_narrow(s3_lo[2], s3_hi[2], s3_lo[5], s3_hi[5], + cospi_24_64, cospi_8_64, &s2_lo[2], + &s2_hi[2], &s2_lo[5], &s2_hi[5]); + + // step 5 + s1_lo[0] = vaddq_s32(s3_lo[0], s2_lo[1]); + s1_hi[0] = vaddq_s32(s3_hi[0], s2_hi[1]); + s1_lo[1] = vsubq_s32(s3_lo[0], s2_lo[1]); + s1_hi[1] = vsubq_s32(s3_hi[0], s2_hi[1]); + s1_lo[2] = vaddq_s32(s3_lo[3], s2_lo[2]); + s1_hi[2] = vaddq_s32(s3_hi[3], s2_hi[2]); + s1_lo[3] = vsubq_s32(s3_lo[3], s2_lo[2]); + s1_hi[3] = vsubq_s32(s3_hi[3], s2_hi[2]); + s1_lo[4] = vsubq_s32(s3_lo[4], s2_lo[5]); + s1_hi[4] = vsubq_s32(s3_hi[4], s2_hi[5]); + s1_lo[5] = vaddq_s32(s3_lo[4], s2_lo[5]); + s1_hi[5] = vaddq_s32(s3_hi[4], s2_hi[5]); + s1_lo[6] = vsubq_s32(s3_lo[7], s2_lo[6]); + s1_hi[6] = vsubq_s32(s3_hi[7], s2_hi[6]); + s1_lo[7] = vaddq_s32(s3_lo[7], s2_lo[6]); + s1_hi[7] = vaddq_s32(s3_hi[7], s2_hi[6]); + + // step 6 + // out[1] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64 + // out[15] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[7], s1_hi[7], s1_lo[0], s1_hi[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + + // out[9] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64 + // out[7] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[6], s1_hi[6], s1_lo[1], s1_hi[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + + // out[5] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64 + // out[11] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); + + // out[13] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64 + // out[3] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + + left[0] = left8[0]; + right[0] = right8[0]; + left[2] = left8[1]; + right[2] = right8[1]; + left[4] = left8[2]; + right[4] = right8[2]; + left[6] = left8[3]; + right[6] = right8[3]; + left[8] = left8[4]; + right[8] = right8[4]; + left[10] = left8[5]; + right[10] = right8[5]; + left[12] = left8[6]; + right[12] = right8[6]; + left[14] = left8[7]; + right[14] = right8[7]; +} + +static void highbd_fadst16_8col(int32x4_t *left, int32x4_t *right) { + // perform 16x16 
1-D ADST for 8 columns + int32x4_t x_lo[16], x_hi[16]; + int32x4_t s_lo[16], s_hi[16]; + int32x4_t t_lo[16], t_hi[16]; + int64x2_t s64_lo[32], s64_hi[32]; + + x_lo[0] = left[15]; + x_hi[0] = right[15]; + x_lo[1] = left[0]; + x_hi[1] = right[0]; + x_lo[2] = left[13]; + x_hi[2] = right[13]; + x_lo[3] = left[2]; + x_hi[3] = right[2]; + x_lo[4] = left[11]; + x_hi[4] = right[11]; + x_lo[5] = left[4]; + x_hi[5] = right[4]; + x_lo[6] = left[9]; + x_hi[6] = right[9]; + x_lo[7] = left[6]; + x_hi[7] = right[6]; + x_lo[8] = left[7]; + x_hi[8] = right[7]; + x_lo[9] = left[8]; + x_hi[9] = right[8]; + x_lo[10] = left[5]; + x_hi[10] = right[5]; + x_lo[11] = left[10]; + x_hi[11] = right[10]; + x_lo[12] = left[3]; + x_hi[12] = right[3]; + x_lo[13] = left[12]; + x_hi[13] = right[12]; + x_lo[14] = left[1]; + x_hi[14] = right[1]; + x_lo[15] = left[14]; + x_hi[15] = right[14]; + + // stage 1, indices are doubled + // s0 = cospi_1_64 * x0 + cospi_31_64 * x1; + // s1 = cospi_31_64 * x0 - cospi_1_64 * x1; + butterfly_two_coeff_s32_s64_noround( + x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_1_64, cospi_31_64, + &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]); + // s2 = cospi_5_64 * x2 + cospi_27_64 * x3; + // s3 = cospi_27_64 * x2 - cospi_5_64 * x3; + butterfly_two_coeff_s32_s64_noround( + x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_5_64, cospi_27_64, + &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]); + // s4 = cospi_9_64 * x4 + cospi_23_64 * x5; + // s5 = cospi_23_64 * x4 - cospi_9_64 * x5; + butterfly_two_coeff_s32_s64_noround( + x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_9_64, cospi_23_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + // s6 = cospi_13_64 * x6 + cospi_19_64 * x7; + // s7 = cospi_19_64 * x6 - cospi_13_64 * x7; + butterfly_two_coeff_s32_s64_noround( + x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_13_64, cospi_19_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + // s8 = cospi_17_64 * x8 + cospi_15_64 * x9; + // s9 = cospi_15_64 * x8 - cospi_17_64 * x9; + butterfly_two_coeff_s32_s64_noround( + x_lo[8], x_hi[8], x_lo[9], x_hi[9], cospi_17_64, cospi_15_64, + &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]); + // s10 = cospi_21_64 * x10 + cospi_11_64 * x11; + // s11 = cospi_11_64 * x10 - cospi_21_64 * x11; + butterfly_two_coeff_s32_s64_noround( + x_lo[10], x_hi[10], x_lo[11], x_hi[11], cospi_21_64, cospi_11_64, + &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]); + // s12 = cospi_25_64 * x12 + cospi_7_64 * x13; + // s13 = cospi_7_64 * x12 - cospi_25_64 * x13; + butterfly_two_coeff_s32_s64_noround( + x_lo[12], x_hi[12], x_lo[13], x_hi[13], cospi_25_64, cospi_7_64, + &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]); + // s14 = cospi_29_64 * x14 + cospi_3_64 * x15; + // s15 = cospi_3_64 * x14 - cospi_29_64 * x15; + butterfly_two_coeff_s32_s64_noround( + x_lo[14], x_hi[14], x_lo[15], x_hi[15], cospi_29_64, cospi_3_64, + &s64_lo[2 * 14], &s64_hi[2 * 14], &s64_lo[2 * 15], &s64_hi[2 * 15]); + + // fdct_round_shift, indices are doubled + t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]); + t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]); + t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]); + t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]); + t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]); + t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]); + t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]); 
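For reference, the add_s64_round_narrow()/sub_s64_round_narrow() helpers used throughout these stages implement the fdct_round_shift() step named in the comments, applied to 64-bit lanes: combine the two widened butterfly products, add the DCT rounding constant, shift back down and narrow to 32 bits. A scalar sketch, not part of the patch, assuming DCT_CONST_BITS == 14 as defined in vpx_dsp/txfm_common.h:

#include <stdint.h>

#define DCT_CONST_BITS 14

// Sum two widened butterfly products, then round and narrow back to 32 bits,
// mirroring add_s64_round_narrow(); the sub_ variant subtracts instead.
static int32_t fdct_round_shift_add_ref(int64_t a, int64_t b) {
  const int64_t sum = a + b;
  return (int32_t)((sum + ((int64_t)1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}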
+ t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]); + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]); + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]); + t_lo[6] = add_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]); + t_hi[6] = add_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]); + t_lo[7] = add_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]); + t_hi[7] = add_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]); + t_lo[8] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]); + t_hi[8] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]); + t_lo[9] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]); + t_hi[9] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]); + t_lo[10] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]); + t_hi[10] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]); + t_lo[11] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]); + t_hi[11] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]); + t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]); + t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]); + t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]); + t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]); + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]); + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + s_lo[4] = t_lo[4]; + s_hi[4] = t_hi[4]; + s_lo[5] = t_lo[5]; + s_hi[5] = t_hi[5]; + s_lo[6] = t_lo[6]; + s_hi[6] = t_hi[6]; + s_lo[7] = t_lo[7]; + s_hi[7] = t_hi[7]; + // s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + // s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[8], t_hi[8], t_lo[9], t_hi[9], cospi_4_64, cospi_28_64, + &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]); + // s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + // s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[10], t_hi[10], t_lo[11], t_hi[11], cospi_20_64, cospi_12_64, + &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]); + // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + // s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[13], t_hi[13], t_lo[12], t_hi[12], cospi_28_64, cospi_4_64, + &s64_lo[2 * 13], &s64_hi[2 * 13], &s64_lo[2 * 12], &s64_hi[2 * 12]); + // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + // s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_12_64, cospi_20_64, + &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]); + + // s0 + s4 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[4]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[4]); + // s1 + s5 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[5]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[5]); + // s2 + s6 + t_lo[2] = add_s32_s64_narrow(s_lo[2], s_lo[6]); + t_hi[2] = add_s32_s64_narrow(s_hi[2], s_hi[6]); + // s3 + s7 + t_lo[3] = add_s32_s64_narrow(s_lo[3], s_lo[7]); + t_hi[3] = 
add_s32_s64_narrow(s_hi[3], s_hi[7]); + + // s0 - s4 + t_lo[4] = sub_s32_s64_narrow(s_lo[0], s_lo[4]); + t_hi[4] = sub_s32_s64_narrow(s_hi[0], s_hi[4]); + // s1 - s5 + t_lo[5] = sub_s32_s64_narrow(s_lo[1], s_lo[5]); + t_hi[5] = sub_s32_s64_narrow(s_hi[1], s_hi[5]); + // s2 - s6 + t_lo[6] = sub_s32_s64_narrow(s_lo[2], s_lo[6]); + t_hi[6] = sub_s32_s64_narrow(s_hi[2], s_hi[6]); + // s3 - s7 + t_lo[7] = sub_s32_s64_narrow(s_lo[3], s_lo[7]); + t_hi[7] = sub_s32_s64_narrow(s_hi[3], s_hi[7]); + + // fdct_round_shift() + // s8 + s12 + t_lo[8] = add_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]); + t_hi[8] = add_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]); + // s9 + s13 + t_lo[9] = add_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]); + t_hi[9] = add_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]); + // s10 + s14 + t_lo[10] = add_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]); + t_hi[10] = add_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]); + // s11 + s15 + t_lo[11] = add_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]); + t_hi[11] = add_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]); + + // s8 - s12 + t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]); + t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]); + // s9 - s13 + t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]); + t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]); + // s10 - s14 + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]); + // s11 - s15 + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]); + + // stage 3 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + // s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + // s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[7], t_hi[7], t_lo[6], t_hi[6], cospi_24_64, cospi_8_64, + &s64_lo[2 * 7], &s64_hi[2 * 7], &s64_lo[2 * 6], &s64_hi[2 * 6]); + s_lo[8] = t_lo[8]; + s_hi[8] = t_hi[8]; + s_lo[9] = t_lo[9]; + s_hi[9] = t_hi[9]; + s_lo[10] = t_lo[10]; + s_hi[10] = t_hi[10]; + s_lo[11] = t_lo[11]; + s_hi[11] = t_hi[11]; + // s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + // s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[12], t_hi[12], t_lo[13], t_hi[13], cospi_8_64, cospi_24_64, + &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]); + // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + // s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_24_64, cospi_8_64, + &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]); + + // s0 + s2 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]); + // s1 + s3 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]); + // s0 - s2 + t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]); + // s1 - s3 + t_lo[3] = sub_s32_s64_narrow(s_lo[1], 
s_lo[3]); + t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]); + // fdct_round_shift() + // s4 + s6 + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s5 + s7 + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s4 - s6 + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s5 - s7 + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s8 + s10 + t_lo[8] = add_s32_s64_narrow(s_lo[8], s_lo[10]); + t_hi[8] = add_s32_s64_narrow(s_hi[8], s_hi[10]); + // s9 + s11 + t_lo[9] = add_s32_s64_narrow(s_lo[9], s_lo[11]); + t_hi[9] = add_s32_s64_narrow(s_hi[9], s_hi[11]); + // s8 - s10 + t_lo[10] = sub_s32_s64_narrow(s_lo[8], s_lo[10]); + t_hi[10] = sub_s32_s64_narrow(s_hi[8], s_hi[10]); + // s9 - s11 + t_lo[11] = sub_s32_s64_narrow(s_lo[9], s_lo[11]); + t_hi[11] = sub_s32_s64_narrow(s_hi[9], s_hi[11]); + // fdct_round_shift() + // s12 + s14 + t_lo[12] = add_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]); + t_hi[12] = add_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]); + // s13 + s15 + t_lo[13] = add_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]); + t_hi[13] = add_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]); + // s12 - s14 + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]); + // s13 - s15 + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]); + + // stage 4, with fdct_round_shift + // s2 = (-cospi_16_64) * (x2 + x3); + // s3 = cospi_16_64 * (x2 - x3); + butterfly_one_coeff_s32_s64_narrow(t_lo[3], t_hi[3], t_lo[2], t_hi[2], + -cospi_16_64, &x_lo[2], &x_hi[2], &x_lo[3], + &x_hi[3]); + // s6 = cospi_16_64 * (x6 + x7); + // s7 = cospi_16_64 * (-x6 + x7); + butterfly_one_coeff_s32_s64_narrow(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_16_64, &x_lo[6], &x_hi[6], &x_lo[7], + &x_hi[7]); + // s10 = cospi_16_64 * (x10 + x11); + // s11 = cospi_16_64 * (-x10 + x11); + butterfly_one_coeff_s32_s64_narrow(t_lo[11], t_hi[11], t_lo[10], t_hi[10], + cospi_16_64, &x_lo[10], &x_hi[10], + &x_lo[11], &x_hi[11]); + // s14 = (-cospi_16_64) * (x14 + x15); + // s15 = cospi_16_64 * (x14 - x15); + butterfly_one_coeff_s32_s64_narrow(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + -cospi_16_64, &x_lo[14], &x_hi[14], + &x_lo[15], &x_hi[15]); + + // Just copy x0, x1, x4, x5, x8, x9, x12, x13 + x_lo[0] = t_lo[0]; + x_hi[0] = t_hi[0]; + x_lo[1] = t_lo[1]; + x_hi[1] = t_hi[1]; + x_lo[4] = t_lo[4]; + x_hi[4] = t_hi[4]; + x_lo[5] = t_lo[5]; + x_hi[5] = t_hi[5]; + x_lo[8] = t_lo[8]; + x_hi[8] = t_hi[8]; + x_lo[9] = t_lo[9]; + x_hi[9] = t_hi[9]; + x_lo[12] = t_lo[12]; + x_hi[12] = t_hi[12]; + x_lo[13] = t_lo[13]; + x_hi[13] = t_hi[13]; + + left[0] = x_lo[0]; + right[0] = x_hi[0]; + left[1] = vnegq_s32(x_lo[8]); + right[1] = vnegq_s32(x_hi[8]); + left[2] = x_lo[12]; + right[2] = x_hi[12]; + left[3] = vnegq_s32(x_lo[4]); + right[3] = vnegq_s32(x_hi[4]); + left[4] = x_lo[6]; + right[4] = x_hi[6]; + left[5] = x_lo[14]; + right[5] = x_hi[14]; + left[6] = x_lo[10]; + right[6] = x_hi[10]; + left[7] = x_lo[2]; + right[7] = x_hi[2]; + left[8] = x_lo[3]; + right[8] = x_hi[3]; + left[9] = x_lo[11]; + right[9] = x_hi[11]; + left[10] = x_lo[15]; + 
right[10] = x_hi[15]; + left[11] = x_lo[7]; + right[11] = x_hi[7]; + left[12] = x_lo[5]; + right[12] = x_hi[5]; + left[13] = vnegq_s32(x_lo[13]); + right[13] = vnegq_s32(x_hi[13]); + left[14] = x_lo[9]; + right[14] = x_hi[9]; + left[15] = vnegq_s32(x_lo[1]); + right[15] = vnegq_s32(x_hi[1]); +} + +static void highbd_fdct16x16_neon(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + // Left half. + highbd_fdct16_8col(left1, right1); + // Right half. + highbd_fdct16_8col(left2, right2); + transpose_s32_16x16(left1, right1, left2, right2); +} + +static void highbd_fadst16x16_neon(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + // Left half. + highbd_fadst16_8col(left1, right1); + // Right half. + highbd_fadst16_8col(left2, right2); + transpose_s32_16x16(left1, right1, left2, right2); +} + +void vp9_highbd_fht16x16_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t left1[16], right1[16], left2[16], right2[16]; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct16x16_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fdct16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + case DCT_ADST: + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fdct16x16_neon(left1, right1, left2, right2); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + } +} + #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/vp9/encoder/arm/neon/vp9_denoiser_neon.c index 53e8c7e49..d631cd437 100644 --- a/vp9/encoder/arm/neon/vp9_denoiser_neon.c +++ b/vp9/encoder/arm/neon/vp9_denoiser_neon.c @@ -21,7 +21,7 @@ // Compute the sum of all pixel differences of this MB. static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_s8(v_sum_diff_total); #else const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total); diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index 33753f77b..b82b3f9db 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -30,30 +30,6 @@ static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { return result; } -static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { - // This is simplified from the C implementation to utilise that - // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and - // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] - return mv.as_int == 0 ?
0 : 1; -} - -static INLINE int mv_cost(const int_mv mv, const int *joint_cost, - int *const comp_cost[2]) { - assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX); - assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX); - return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + - comp_cost[1][mv.as_mv.col]; -} - -static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, - int sad_per_bit) { - const int_mv diff = - pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - /***************************************************************************** * This function utilizes 3 properties of the cost function lookup tables, * * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * @@ -71,8 +47,9 @@ static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, *****************************************************************************/ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, - MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, + uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { static const uint32_t data[4] = { 0, 1, 2, 3 }; const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); @@ -101,8 +78,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int)); - const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); + const int ref_row = ref_mv->row; + const int ref_col = ref_mv->col; int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; @@ -117,12 +94,13 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Work out the start point for the search const uint8_t *best_address = in_what; const uint8_t *new_best_address = best_address; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address); #else int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); #endif - unsigned int best_sad = INT_MAX; + // Starting position + unsigned int best_sad = start_mv_sad; int i, j, step; // Check the prerequisite cost function properties that are easy to check @@ -131,10 +109,6 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - *num00 = 0; for (i = 0, step = 0; step < tot_steps; step++) { @@ -143,7 +117,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, int8x16_t v_inside_d; uint32x4_t v_outside_d; int32x4_t v_cost_d, v_sad_d; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 int64x2_t v_blocka[2]; #else int32x4_t v_blocka[1]; @@ -164,7 +138,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, vreinterpretq_s32_s16(v_these_mv_w))); // If none of them are inside, then move on -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d)); #else horiz_max_0 = 
vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)), @@ -193,7 +167,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Compute the SIMD pointer offsets. { -#if defined(__aarch64__) // sizeof(intptr_t) == 8 +#if VPX_ARCH_AARCH64 // sizeof(intptr_t) == 8 // Load the offsets int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]); int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]); @@ -214,13 +188,13 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, #endif } - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); + sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); // Look up the component cost of the residual motion vector { uint32_t cost[4]; - int16_t __attribute__((aligned(16))) rowcol[8]; + DECLARE_ALIGNED(16, int16_t, rowcol[8]); vst1q_s16(rowcol, v_diff_mv_w); // Note: This is a use case for gather instruction @@ -260,7 +234,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Find the minimum value and index horizontally in v_sad_d { uint32_t local_best_sad; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d)); #else uint32x2_t horiz_min_0 = @@ -282,7 +256,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d); v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff)); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 local_best_idx = vminvq_u32(v_mask_d); #else horiz_min_0 = @@ -306,7 +280,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, best_address = new_best_address; v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 v_ba_q = vdupq_n_s64((intptr_t)best_address); #else v_ba_d = vdupq_n_s32((intptr_t)best_address); diff --git a/vp9/encoder/arm/neon/vp9_error_neon.c b/vp9/encoder/arm/neon/vp9_error_neon.c index 1c7503139..0cf0bf250 100644 --- a/vp9/encoder/arm/neon/vp9_error_neon.c +++ b/vp9/encoder/arm/neon/vp9_error_neon.c @@ -12,30 +12,91 @@ #include <assert.h> #include "./vp9_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, - int block_size) { - int64x2_t error = vdupq_n_s64(0); +int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + uint64x2_t err_u64 = vdupq_n_u64(0); + int64x2_t ssz_s64 = vdupq_n_s64(0); - assert(block_size >= 8); - assert((block_size % 8) == 0); + assert(block_size >= 16); + assert((block_size % 16) == 0); do { - const int16x8_t c = vld1q_s16(coeff); - const int16x8_t d = vld1q_s16(dqcoeff); - const int16x8_t diff = vsubq_s16(c, d); - const int16x4_t diff_lo = vget_low_s16(diff); - const int16x4_t diff_hi = vget_high_s16(diff); - // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before + uint32x4_t err; + int32x4_t ssz0, ssz1; + + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // diff is 15-bits, the squares 30, so we can store 4 in 32-bits before // accumulating them in 64-bits. 
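The bound quoted in that comment can be cross-checked with a scalar model (illustrative only, not part of the patch; it assumes block_size is a multiple of 4): |diff| <= 2^15 - 1, so each square is below 2^30, and four squares still fit in an unsigned 32-bit accumulator before being folded into the running 64-bit total, which is exactly what the vmull/vmlal/vpadalq sequence vectorizes.

#include <stdint.h>
#include <stdlib.h>

static uint64_t block_error_scalar_ref(const int16_t *coeff,
                                       const int16_t *dqcoeff, int block_size) {
  uint64_t err = 0;
  int i, j;
  for (i = 0; i < block_size; i += 4) {
    uint32_t acc = 0;  // at most 4 * (2^15 - 1)^2 < 2^32, so no overflow
    for (j = 0; j < 4; ++j) {
      const uint32_t d = (uint32_t)abs(coeff[i + j] - dqcoeff[i + j]);
      acc += d * d;
    }
    err += acc;  // widen into 64 bits, as vpadalq_u32 does per vector lane
  }
  return err;
}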
- const int32x4_t err0 = vmull_s16(diff_lo, diff_lo); - const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi); - const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1)); - error = vaddq_s64(error, err2); - coeff += 8; - dqcoeff += 8; - block_size -= 8; + err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0)); + err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1)); + err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64 = vpadalq_u32(err_u64, err); + + // We can't do the same here as we're operating on signed integers, so we + // can only store 2 of the 30-bit squares in 32 bits before accumulating + // into 64-bits. + ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0)); + ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz0); + + ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1)); + ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + *ssz = horizontal_add_int64x2(ssz_s64); + return (int64_t)horizontal_add_uint64x2(err_u64); +} + +int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + uint64x2_t err_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + uint32x4_t err0, err1; + + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // diff is 15-bits, the squares 30, so in theory we can store 4 in 32-bits + // before accumulating them in 64-bits. However splitting into 2 mull, mlal + // pairs is beneficial since it allows us to use both Neon + // multiply-accumulate pipes - on CPUs that have them - rather than having + // a single chain of 4 instructions executing serially. + err0 = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err0 = vmlal_u16(err0, vget_high_u16(diff0), vget_high_u16(diff0)); + err_u64[0] = vpadalq_u32(err_u64[0], err0); + + err1 = vmull_u16(vget_low_u16(diff1), vget_low_u16(diff1)); + err1 = vmlal_u16(err1, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64[1] = vpadalq_u32(err_u64[1], err1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; } while (block_size != 0); - return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1); + return horizontal_add_uint64x2(vaddq_u64(err_u64[0], err_u64[1])); } diff --git a/vp9/encoder/arm/neon/vp9_highbd_error_neon.c b/vp9/encoder/arm/neon/vp9_highbd_error_neon.c new file mode 100644 index 000000000..d9b183472 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_highbd_error_neon.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +int64_t vp9_highbd_block_error_neon(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, int bd) { + uint64x2_t err_u64 = vdupq_n_u64(0); + int64x2_t ssz_s64 = vdupq_n_s64(0); + + const int shift = 2 * (bd - 8); + const int rounding = shift > 0 ? 1 << (shift - 1) : 0; + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int32x4_t c = load_tran_low_to_s32q(coeff); + const int32x4_t d = load_tran_low_to_s32q(dqcoeff); + + const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d)); + + err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff)); + err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff)); + + ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c)); + ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c)); + + coeff += 4; + dqcoeff += 4; + block_size -= 4; + } while (block_size != 0); + + *ssz = (horizontal_add_int64x2(ssz_s64) + rounding) >> shift; + return ((int64_t)horizontal_add_uint64x2(err_u64) + rounding) >> shift; +} diff --git a/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c b/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c new file mode 100644 index 000000000..c3aef3c86 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c @@ -0,0 +1,872 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <arm_neon.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" + +// Compute (a-b)**2 for 8 pixels with size 16-bit +static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b, + uint32_t *dst) { + const uint16x8_t a_reg = vld1q_u16(a); + const uint16x8_t b_reg = vld1q_u16(b); + + uint16x8_t dist = vabdq_u16(a_reg, b_reg); + uint32x4_t dist_first = vmull_u16(vget_low_u16(dist), vget_low_u16(dist)); + uint32x4_t dist_second = vmull_u16(vget_high_u16(dist), vget_high_u16(dist)); + + vst1q_u32(dst, dist_first); + vst1q_u32(dst + 4, dist_second); +} + +// Sum up three neighboring distortions for the pixels +static INLINE void highbd_get_sum_4(const uint32_t *dist, uint32x4_t *sum) { + uint32x4_t dist_reg, dist_left, dist_right; + + dist_reg = vld1q_u32(dist); + dist_left = vld1q_u32(dist - 1); + dist_right = vld1q_u32(dist + 1); + + *sum = vaddq_u32(dist_reg, dist_left); + *sum = vaddq_u32(*sum, dist_right); +} + +static INLINE void highbd_get_sum_8(const uint32_t *dist, uint32x4_t *sum_first, + uint32x4_t *sum_second) { + highbd_get_sum_4(dist, sum_first); + highbd_get_sum_4(dist + 4, sum_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values, plus +// however many values are added in from the y/uv planes). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight.
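A scalar sketch of the averaging step that comment describes, as implemented by highbd_average_4() below (illustrative, not part of the patch; per the "modifier * 3 / index" note in the implementation, each mul_constants entry encodes roughly 3 / index scaled into the upper 32 bits):

#include <stdint.h>

// mod = (sum * mul + (rounding << 32)) >> (strength + 32), clamped to 16,
// then inverted and scaled by the subblock weight.
static uint32_t highbd_average_scalar_ref(uint32_t sum, uint32_t mul,
                                          int strength, uint32_t rounding,
                                          int weight) {
  const uint64_t widened = ((uint64_t)rounding << 32) + (uint64_t)sum * mul;
  uint32_t mod = (uint32_t)(widened >> (strength + 32));
  if (mod > 16) mod = 16;                // vminq_u32(sum2, sixteen)
  return (16 - mod) * (uint32_t)weight;  // invert, then multiply by weight
}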
+static INLINE void highbd_average_4(uint32x4_t *output, const uint32x4_t sum, + const uint32x4_t *mul_constants, + const int strength, const int rounding, + const int weight) { + const int64x2_t strength_s64 = vdupq_n_s64(-strength - 32); + const uint64x2_t rounding_u64 = vdupq_n_u64((uint64_t)rounding << 32); + const uint32x4_t weight_u32 = vdupq_n_u32(weight); + const uint32x4_t sixteen = vdupq_n_u32(16); + uint32x4_t sum2; + + // modifier * 3 / index; + uint64x2_t sum_lo = + vmlal_u32(rounding_u64, vget_low_u32(sum), vget_low_u32(*mul_constants)); + uint64x2_t sum_hi = vmlal_u32(rounding_u64, vget_high_u32(sum), + vget_high_u32(*mul_constants)); + + // we cannot use vshrn_n_u64 as strength is not known at compile time. + sum_lo = vshlq_u64(sum_lo, strength_s64); + sum_hi = vshlq_u64(sum_hi, strength_s64); + + sum2 = vcombine_u32(vmovn_u64(sum_lo), vmovn_u64(sum_hi)); + + // Multiply with the weight + sum2 = vminq_u32(sum2, sixteen); + sum2 = vsubq_u32(sixteen, sum2); + *output = vmulq_u32(sum2, weight_u32); +} + +static INLINE void highbd_average_8(uint32x4_t *output_0, uint32x4_t *output_1, + const uint32x4_t sum_0_u32, + const uint32x4_t sum_1_u32, + const uint32x4_t *mul_constants_0, + const uint32x4_t *mul_constants_1, + const int strength, const int rounding, + const int weight) { + highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding, + weight); + highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding, + weight); +} + +// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static INLINE void highbd_accumulate_and_store_8( + const uint32x4_t sum_first_u32, const uint32x4_t sum_second_u32, + const uint16_t *pred, uint16_t *count, uint32_t *accumulator) { + const uint16x8_t sum_u16 = + vcombine_u16(vqmovn_u32(sum_first_u32), vqmovn_u32(sum_second_u32)); + uint16x8_t pred_u16 = vld1q_u16(pred); + uint16x8_t count_u16 = vld1q_u16(count); + uint32x4_t pred_0_u32, pred_1_u32; + uint32x4_t accum_0_u32, accum_1_u32; + + count_u16 = vqaddq_u16(count_u16, sum_u16); + vst1q_u16(count, count_u16); + + accum_0_u32 = vld1q_u32(accumulator); + accum_1_u32 = vld1q_u32(accumulator + 4); + + pred_0_u32 = vmovl_u16(vget_low_u16(pred_u16)); + pred_1_u32 = vmovl_u16(vget_high_u16(pred_u16)); + + // Don't use sum_u16 as that produces different results to the C version + accum_0_u32 = vmlaq_u32(accum_0_u32, sum_first_u32, pred_0_u32); + accum_1_u32 = vmlaq_u32(accum_1_u32, sum_second_u32, pred_1_u32); + + vst1q_u32(accumulator, accum_0_u32); + vst1q_u32(accumulator + 4, accum_1_u32); +} + +static INLINE void highbd_read_dist_4(const uint32_t *dist, + uint32x4_t *dist_reg) { + *dist_reg = vld1q_u32(dist); +} + +static INLINE void highbd_read_dist_8(const uint32_t *dist, + uint32x4_t *reg_first, + uint32x4_t *reg_second) { + highbd_read_dist_4(dist, reg_first); + highbd_read_dist_4(dist + 4, reg_second); +} + +static INLINE void highbd_read_chroma_dist_row_8( + int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, + uint32x4_t *u_first, uint32x4_t *u_second, uint32x4_t *v_first, + uint32x4_t *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 8 entries from chroma. 
+ highbd_read_dist_8(u_dist, u_first, u_second); + highbd_read_dist_8(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 4 entries + uint32x4_t u_reg, v_reg; + uint32x4x2_t pair; + + highbd_read_dist_4(u_dist, &u_reg); + + pair = vzipq_u32(u_reg, u_reg); + *u_first = pair.val[0]; + *u_second = pair.val[1]; + + highbd_read_dist_4(v_dist, &v_reg); + + pair = vzipq_u32(v_reg, v_reg); + *v_first = pair.val[0]; + *v_second = pair.val[1]; + } +} + +static void highbd_apply_temporal_filter_luma_8( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_first, + const uint32_t *const *neighbors_second, int top_weight, + int bottom_weight) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + uint32x4_t mul_first, mul_second; + + uint32x4_t sum_row_1_first, sum_row_1_second; + uint32x4_t sum_row_2_first, sum_row_2_second; + uint32x4_t sum_row_3_first, sum_row_3_second; + + uint32x4_t u_first, u_second; + uint32x4_t v_first, v_second; + + uint32x4_t sum_row_first; + uint32x4_t sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(block_width == 8); + + (void)block_width; + + // First row + mul_first = vld1q_u32(neighbors_first[0]); + mul_second = vld1q_u32(neighbors_second[0]); + + // Add luma values + highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second); + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + // We don't need to saturate here because the maximum value is UINT12_MAX ** 2 + // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX + sum_row_first = vaddq_u32(sum_row_2_first, sum_row_3_first); + sum_row_second = vaddq_u32(sum_row_2_second, sum_row_3_second); + + // Add chroma values + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = vld1q_u32(neighbors_first[1]); + mul_second = vld1q_u32(neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + weight = bottom_weight; + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first); + sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second); + + highbd_get_sum_8(y_dist + DIST_STRIDE,
&sum_row_3_first, &sum_row_3_second); + + sum_row_first = vaddq_u32(sum_row_first, sum_row_3_first); + sum_row_second = vaddq_u32(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, + rounding, weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = vld1q_u32(neighbors_first[0]); + mul_second = vld1q_u32(neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first); + sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + } + + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void highbd_apply_temporal_filter_luma( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? 
blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_first; + const uint32_t *const *neighbors_second; + + // Left + neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + // Right + neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); +} + +// Add a row of luma distortion that corresponds to 8 chroma mods. If we are +// subsampling in x direction, then we have 16 lumas, else we have 8. 
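+// For example, with ss_x == 1 luma columns 2i and 2i+1 both cover chroma
+// column i, so the 16 luma values are pairwise-added (vpaddlq_u32) and
+// narrowed back to 8 values before being added to the chroma modifiers.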
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod( + const uint32_t *y_dist, int ss_x, int ss_y, uint32x4_t *u_mod_fst, + uint32x4_t *u_mod_snd, uint32x4_t *v_mod_fst, uint32x4_t *v_mod_snd) { + uint32x4_t y_reg_fst, y_reg_snd; + if (!ss_x) { + highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + y_reg_fst = vaddq_u32(y_reg_fst, y_tmp_fst); + y_reg_snd = vaddq_u32(y_reg_snd, y_tmp_snd); + } + } else { + // Temporary + uint32x4_t y_fst, y_snd; + uint64x2_t y_fst64, y_snd64; + + // First 8 + highbd_read_dist_8(y_dist, &y_fst, &y_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = vaddq_u32(y_fst, y_tmp_fst); + y_snd = vaddq_u32(y_snd, y_tmp_snd); + } + + y_fst64 = vpaddlq_u32(y_fst); + y_snd64 = vpaddlq_u32(y_snd); + y_reg_fst = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64)); + + // Second 8 + highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = vaddq_u32(y_fst, y_tmp_fst); + y_snd = vaddq_u32(y_snd, y_tmp_snd); + } + + y_fst64 = vpaddlq_u32(y_fst); + y_snd64 = vpaddlq_u32(y_snd); + y_reg_snd = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64)); + } + + *u_mod_fst = vaddq_u32(*u_mod_fst, y_reg_fst); + *u_mod_snd = vaddq_u32(*u_mod_snd, y_reg_snd); + *v_mod_fst = vaddq_u32(*v_mod_fst, y_reg_fst); + *v_mod_snd = vaddq_u32(*v_mod_snd, y_reg_snd); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. 
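+//
+// Per output pixel this computes, roughly, a 3x3 neighborhood sum of the
+// chroma distortions plus the co-located luma distortions, maps that sum to
+// a filter weight via highbd_average_4/highbd_average_8, and accumulates the
+// weight and weight * prediction into the count and accumulator buffers.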
+static void highbd_apply_temporal_filter_chroma_8( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int uv_block_width, unsigned int uv_block_height, int ss_x, + int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, + int top_weight, int bottom_weight, const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + uint32x4_t mul_fst, mul_snd; + + uint32x4_t u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst; + uint32x4_t v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst; + uint32x4_t u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd; + uint32x4_t v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd; + + uint32x4_t u_sum_row_fst, v_sum_row_fst; + uint32x4_t u_sum_row_snd, v_sum_row_snd; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul_fst = vld1q_u32(neighbors_fst[0]); + mul_snd = vld1q_u32(neighbors_snd[0]); + + // Add chroma values + highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + + u_sum_row_fst = vaddq_u32(u_sum_row_2_fst, u_sum_row_3_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_2_snd, u_sum_row_3_snd); + + highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + + v_sum_row_fst = vaddq_u32(v_sum_row_2_fst, v_sum_row_3_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_2_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul_fst = vld1q_u32(neighbors_fst[1]); + mul_snd = vld1q_u32(neighbors_snd[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + 
v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + u_sum_row_fst = vaddq_u32(u_sum_row_fst, u_sum_row_3_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_snd, u_sum_row_3_snd); + + v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + v_sum_row_fst = vaddq_u32(v_sum_row_fst, v_sum_row_3_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul_fst = vld1q_u32(neighbors_fst[0]); + mul_snd = vld1q_u32(neighbors_snd[0]); + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst); + v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd); + v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + 
highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); +} + +// Perform temporal filter for the chroma components. +static void highbd_apply_temporal_filter_chroma( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_fst; + const uint32_t *const *neighbors_snd; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } else { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + 
neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); +} + +void vp9_highbd_apply_temporal_filter_neon( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + + uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must 
be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 8) { + highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw, + use_whole_blk, y_accum, y_count, y_dist_ptr, + u_dist_ptr, v_dist_ptr); + + highbd_apply_temporal_filter_chroma( + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index c2b55fcba..96d061436 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -11,11 +11,13 @@ #include <arm_neon.h> #include <assert.h> #include <math.h> +#include <stdint.h> #include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_encoder.h" @@ -50,7 +52,7 @@ static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr, } static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 return (uint16_t)vmaxvq_s16(v_eobmax); #else const int16x4_t v_eobmax_3210 = @@ -65,23 +67,21 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); return (uint16_t)vget_lane_s16(v_eobmax_final, 0); -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 } -static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *dequant_ptr, - int16x8_t *round, int16x8_t *quant, - int16x8_t *dequant) { - *round = vld1q_s16(round_ptr); - *quant = vld1q_s16(quant_ptr); +static VPX_FORCE_INLINE void load_fp_values( + const struct macroblock_plane *mb_plane, const int16_t *dequant_ptr, + int16x8_t *round, int16x8_t *quant, int16x8_t *dequant) { + *round = vld1q_s16(mb_plane->round_fp); + *quant = vld1q_s16(mb_plane->quant_fp); *dequant = vld1q_s16(dequant_ptr); } static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round, int16x8_t *v_quant, int16x8_t *v_dequant) { -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *v_round = vdupq_laneq_s16(*v_round, 1); *v_quant = vdupq_laneq_s16(*v_quant, 1); *v_dequant = vdupq_laneq_s16(*v_dequant, 1); @@ -117,27 +117,26 @@ static VPX_FORCE_INLINE void quantize_fp_8( *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, 
v_nz_mask); } -void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *round_ptr, const int16_t *quant_ptr, +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. int i; int16x8_t v_eobmax = vdupq_n_s16(-1); int16x8_t v_round, v_quant, v_dequant; - (void)scan; + const int16_t *iscan = scan_order->iscan; - load_fp_values(round_ptr, quant_ptr, dequant_ptr, &v_round, &v_quant, - &v_dequant); + load_fp_values(mb_plane, dequant_ptr, &v_round, &v_quant, &v_dequant); // process dc and the first seven ac coeffs quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &v_eobmax); // now process the rest of the ac coeffs update_fp_values(&v_round, &v_quant, &v_dequant); - for (i = 8; i < count; i += 8) { + for (i = 8; i < n_coeffs; i += 8) { quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i, qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax); } @@ -186,23 +185,22 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_8( *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); } -void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *round_ptr, - const int16_t *quant_ptr, +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int16x8_t eob_max = vdupq_n_s16(-1); // ROUND_POWER_OF_TWO(round_ptr[], 1) - int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round_fp), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant_fp); int16x8_t dequant = vld1q_s16(dequant_ptr); // dequant >> 2 is used similar to zbin as a threshold. int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); int i; + const int16_t *iscan = scan_order->iscan; - (void)scan; - (void)count; + (void)n_coeffs; // Process dc and the first seven ac coeffs. 
quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr, @@ -258,23 +256,21 @@ highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int16x4_t v_zero = vdup_n_s16(0); - const int16x4_t v_quant = vld1_s16(quant_ptr); + const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp); const int16x4_t v_dequant = vld1_s16(dequant_ptr); - const int16x4_t v_round = vld1_s16(round_ptr); + const int16x4_t v_round = vld1_s16(mb_plane->round_fp); int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); uint16x4_t v_mask_lo, v_mask_hi; int16x8_t v_eobmax = vdupq_n_s16(-1); - - (void)scan; + const int16_t *iscan = scan_order->iscan; // DC and first 3 AC v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, @@ -349,22 +345,21 @@ highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vp9_highbd_quantize_fp_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - const int16x4_t v_quant = vld1_s16(quant_ptr); + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp); const int16x4_t v_dequant = vld1_s16(dequant_ptr); const int16x4_t v_zero = vdup_n_s16(0); const int16x4_t v_round = - vqrdmulh_n_s16(vld1_s16(round_ptr), (int16_t)(1 << 14)); + vqrdmulh_n_s16(vld1_s16(mb_plane->round_fp), (int16_t)(1 << 14)); int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); uint16x4_t v_mask_lo, v_mask_hi; int16x8_t v_eobmax = vdupq_n_s16(-1); - - (void)scan; + const int16_t *iscan = scan_order->iscan; // DC and first 3 AC v_mask_lo = diff --git a/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c b/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c new file mode 100644 index 000000000..a651a15d9 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c @@ -0,0 +1,849 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
+// difference squared, and store as unsigned 16-bit integers to dst.
+static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
+                                uint16_t *dst) {
+  const uint8x8_t a_reg = vld1_u8(a);
+  const uint8x8_t b_reg = vld1_u8(b);
+
+  uint16x8_t dist_first = vabdl_u8(a_reg, b_reg);
+  dist_first = vmulq_u16(dist_first, dist_first);
+
+  vst1q_u16(dst, dist_first);
+}
+
+static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
+                                 uint16_t *dst) {
+  const uint8x16_t a_reg = vld1q_u8(a);
+  const uint8x16_t b_reg = vld1q_u8(b);
+
+  uint16x8_t dist_first = vabdl_u8(vget_low_u8(a_reg), vget_low_u8(b_reg));
+  uint16x8_t dist_second = vabdl_u8(vget_high_u8(a_reg), vget_high_u8(b_reg));
+  dist_first = vmulq_u16(dist_first, dist_first);
+  dist_second = vmulq_u16(dist_second, dist_second);
+
+  vst1q_u16(dst, dist_first);
+  vst1q_u16(dst + 8, dist_second);
+}
+
+static INLINE void read_dist_8(const uint16_t *dist, uint16x8_t *dist_reg) {
+  *dist_reg = vld1q_u16(dist);
+}
+
+static INLINE void read_dist_16(const uint16_t *dist, uint16x8_t *reg_first,
+                                uint16x8_t *reg_second) {
+  read_dist_8(dist, reg_first);
+  read_dist_8(dist + 8, reg_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values).
+//
+// Add in the rounding factor and shift, clamp to 16, invert (16 - sum) and
+// multiply by weight.
+static INLINE uint16x8_t average_8(uint16x8_t sum,
+                                   const uint16x8_t *mul_constants,
+                                   const int strength, const int rounding,
+                                   const uint16x8_t *weight) {
+  const uint32x4_t rounding_u32 = vdupq_n_u32(rounding << 16);
+  const uint16x8_t weight_u16 = *weight;
+  const uint16x8_t sixteen = vdupq_n_u16(16);
+  const int32x4_t strength_s32 = vdupq_n_s32(-strength - 16);
+
+  // modifier * 3 / index, with mul_constants holding (3 << 16) / index.
+  uint32x4_t sum_lo =
+      vmull_u16(vget_low_u16(sum), vget_low_u16(*mul_constants));
+  uint32x4_t sum_hi =
+      vmull_u16(vget_high_u16(sum), vget_high_u16(*mul_constants));
+
+  sum_lo = vqaddq_u32(sum_lo, rounding_u32);
+  sum_hi = vqaddq_u32(sum_hi, rounding_u32);
+
+  // We cannot use vshrn_n_u32 as strength is not known at compile time.
+  sum_lo = vshlq_u32(sum_lo, strength_s32);
+  sum_hi = vshlq_u32(sum_hi, strength_s32);
+
+  sum = vcombine_u16(vmovn_u32(sum_lo), vmovn_u32(sum_hi));
+
+  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+  // >> 16 (also NEIGHBOR_CONSTANT_4 - 1), which is 49151 / 0xbfff, or -16385
+  // when reinterpreted as signed 16 bits, so the comparison must be done on
+  // unsigned values (vminq_u16).
+  sum = vminq_u16(sum, sixteen);
+  sum = vsubq_u16(sixteen, sum);
+  return vmulq_u16(sum, weight_u16);
+}
+
+// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
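+// In scalar terms, for each of the 8 lanes:
+//   count[i]       = saturating_add(count[i], sum_u16[i]);
+//   accumulator[i] += sum_u16[i] * pred[i];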
+static void accumulate_and_store_8(const uint16x8_t sum_u16,
+                                   const uint8_t *pred, uint16_t *count,
+                                   uint32_t *accumulator) {
+  uint16x8_t pred_u16 = vmovl_u8(vld1_u8(pred));
+  uint16x8_t count_u16 = vld1q_u16(count);
+  uint32x4_t accum_0_u32, accum_1_u32;
+
+  count_u16 = vqaddq_u16(count_u16, sum_u16);
+  vst1q_u16(count, count_u16);
+
+  accum_0_u32 = vld1q_u32(accumulator);
+  accum_1_u32 = vld1q_u32(accumulator + 4);
+
+  accum_0_u32 =
+      vmlal_u16(accum_0_u32, vget_low_u16(sum_u16), vget_low_u16(pred_u16));
+  accum_1_u32 =
+      vmlal_u16(accum_1_u32, vget_high_u16(sum_u16), vget_high_u16(pred_u16));
+
+  vst1q_u32(accumulator, accum_0_u32);
+  vst1q_u32(accumulator + 4, accum_1_u32);
+}
+
+static INLINE void accumulate_and_store_16(const uint16x8_t sum_0_u16,
+                                           const uint16x8_t sum_1_u16,
+                                           const uint8_t *pred, uint16_t *count,
+                                           uint32_t *accumulator) {
+  uint8x16_t pred_u8 = vld1q_u8(pred);
+  uint16x8_t pred_0_u16 = vmovl_u8(vget_low_u8(pred_u8));
+  uint16x8_t pred_1_u16 = vmovl_u8(vget_high_u8(pred_u8));
+  uint16x8_t count_0_u16 = vld1q_u16(count);
+  uint16x8_t count_1_u16 = vld1q_u16(count + 8);
+  uint32x4_t accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
+
+  count_0_u16 = vqaddq_u16(count_0_u16, sum_0_u16);
+  vst1q_u16(count, count_0_u16);
+  count_1_u16 = vqaddq_u16(count_1_u16, sum_1_u16);
+  vst1q_u16(count + 8, count_1_u16);
+
+  accum_0_u32 = vld1q_u32(accumulator);
+  accum_1_u32 = vld1q_u32(accumulator + 4);
+  accum_2_u32 = vld1q_u32(accumulator + 8);
+  accum_3_u32 = vld1q_u32(accumulator + 12);
+
+  accum_0_u32 =
+      vmlal_u16(accum_0_u32, vget_low_u16(sum_0_u16), vget_low_u16(pred_0_u16));
+  accum_1_u32 = vmlal_u16(accum_1_u32, vget_high_u16(sum_0_u16),
+                          vget_high_u16(pred_0_u16));
+  accum_2_u32 =
+      vmlal_u16(accum_2_u32, vget_low_u16(sum_1_u16), vget_low_u16(pred_1_u16));
+  accum_3_u32 = vmlal_u16(accum_3_u32, vget_high_u16(sum_1_u16),
+                          vget_high_u16(pred_1_u16));
+
+  vst1q_u32(accumulator, accum_0_u32);
+  vst1q_u32(accumulator + 4, accum_1_u32);
+  vst1q_u32(accumulator + 8, accum_2_u32);
+  vst1q_u32(accumulator + 12, accum_3_u32);
+}
+
+// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int.
+static INLINE void get_sum_8(const uint16_t *y_dist, uint16x8_t *sum) {
+  uint16x8_t dist_reg, dist_left, dist_right;
+
+  dist_reg = vld1q_u16(y_dist);
+  dist_left = vld1q_u16(y_dist - 1);
+  dist_right = vld1q_u16(y_dist + 1);
+
+  *sum = vqaddq_u16(dist_reg, dist_left);
+  *sum = vqaddq_u16(*sum, dist_right);
+}
+
+// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1]. Store the result for the first 8 pixels in
+// sum_first and the rest in sum_second.
+static INLINE void get_sum_16(const uint16_t *y_dist, uint16x8_t *sum_first,
+                              uint16x8_t *sum_second) {
+  get_sum_8(y_dist, sum_first);
+  get_sum_8(y_dist + 8, sum_second);
+}
+
+// Read in a row of chroma values that corresponds to a row of 16 luma
+// values.
+static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
+                                           const uint16_t *v_dist,
+                                           uint16x8_t *u_first,
+                                           uint16x8_t *u_second,
+                                           uint16x8_t *v_first,
+                                           uint16x8_t *v_second) {
+  if (!ss_x) {
+    // If there is no chroma subsampling in the horizontal direction, then we
+    // need to load 16 entries from chroma.
+ read_dist_16(u_dist, u_first, u_second); + read_dist_16(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + uint16x8_t u_reg, v_reg; + uint16x8x2_t pair; + + read_dist_8(u_dist, &u_reg); + + pair = vzipq_u16(u_reg, u_reg); + *u_first = pair.val[0]; + *u_second = pair.val[1]; + + read_dist_8(v_dist, &v_reg); + + pair = vzipq_u16(v_reg, v_reg); + *v_first = pair.val[0]; + *v_second = pair.val[1]; + } +} + +// Add a row of luma distortion to 8 corresponding chroma mods. +static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, + int ss_x, int ss_y, + uint16x8_t *u_mod, + uint16x8_t *v_mod) { + uint16x8_t y_reg; + if (!ss_x) { + read_dist_8(y_dist, &y_reg); + if (ss_y == 1) { + uint16x8_t y_tmp; + read_dist_8(y_dist + DIST_STRIDE, &y_tmp); + + y_reg = vqaddq_u16(y_reg, y_tmp); + } + } else { + uint16x8_t y_first, y_second; + uint32x4_t y_first32, y_second32; + + read_dist_16(y_dist, &y_first, &y_second); + if (ss_y == 1) { + uint16x8_t y_tmp_0, y_tmp_1; + read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1); + + y_first = vqaddq_u16(y_first, y_tmp_0); + y_second = vqaddq_u16(y_second, y_tmp_1); + } + + y_first32 = vpaddlq_u16(y_first); + y_second32 = vpaddlq_u16(y_second); + + y_reg = vcombine_u16(vqmovn_u32(y_first32), vqmovn_u32(y_second32)); + } + + *u_mod = vqaddq_u16(*u_mod, y_reg); + *v_mod = vqaddq_u16(*v_mod, y_reg); +} + +// Apply temporal filter to the luma components. This performs temporal +// filtering on a luma block of 16 X block_height. Use blk_fw as an array of +// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, +// else use top_weight for top half, and bottom weight for bottom half. +static void apply_temporal_filter_luma_16( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors_first, + const int16_t *const *neighbors_second, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + uint16x8_t weight_first, weight_second; + + uint16x8_t mul_first, mul_second; + + uint16x8_t sum_row_1_first, sum_row_1_second; + uint16x8_t sum_row_2_first, sum_row_2_second; + uint16x8_t sum_row_3_first, sum_row_3_second; + + uint16x8_t u_first, u_second; + uint16x8_t v_first, v_second; + + uint16x8_t sum_row_first; + uint16x8_t sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 0); + assert(strength <= 6); + + assert(block_width == 16); + (void)block_width; + + // Initialize the weights + if (blk_fw) { + weight_first = vdupq_n_u16(blk_fw[0]); + weight_second = vdupq_n_u16(blk_fw[1]); + } else { + weight_first = vdupq_n_u16(top_weight); + weight_second = weight_first; + } + + // First row + mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]); + + // Add luma values + get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = vqaddq_u16(sum_row_2_first, sum_row_3_first); + sum_row_second = vqaddq_u16(sum_row_2_second, sum_row_3_second); + + // Add chroma values + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = 
vqaddq_u16(sum_row_second, u_second); + + sum_row_first = vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = vld1q_u16((const uint16_t *)neighbors_first[1]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + if (blk_fw) { + weight_first = vdupq_n_u16(blk_fw[2]); + weight_second = vdupq_n_u16(blk_fw[3]); + } else { + weight_first = vdupq_n_u16(bottom_weight); + weight_second = weight_first; + } + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first); + sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second); + + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = vqaddq_u16(sum_row_first, sum_row_3_first); + sum_row_second = vqaddq_u16(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = vqaddq_u16(sum_row_second, u_second); + sum_row_first = vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first); + sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + } + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = vqaddq_u16(sum_row_second, u_second); + sum_row_first = 
vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void apply_temporal_filter_luma( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors_first; + const int16_t *const *neighbors_second; + + if (block_width == 16) { + // Special Case: The block width is 16 and we are operating on a row of 16 + // chroma pixels. In this case, we can't use the usual left-middle-right + // pattern. We also don't support splitting now. + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + if (use_whole_blk) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } else { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, 0, 0, blk_fw); + } + + return; + } + + // Left + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS; + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + // Right + 
neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void apply_temporal_filter_chroma_8( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + + uint16x8_t weight; + + uint16x8_t mul; + + uint16x8_t u_sum_row_1, u_sum_row_2, u_sum_row_3; + uint16x8_t v_sum_row_1, v_sum_row_2, v_sum_row_3; + + uint16x8_t u_sum_row, v_sum_row; + + // Loop variable + unsigned int h; + + // Initialize weight + if (blk_fw) { + weight = vcombine_u16(vdup_n_u16(blk_fw[0]), vdup_n_u16(blk_fw[1])); + } else { + weight = vdupq_n_u16(top_weight); + } + + // First row + mul = vld1q_u16((const uint16_t *)neighbors[0]); + + // Add chroma values + get_sum_8(u_dist, &u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + + u_sum_row = vqaddq_u16(u_sum_row_2, u_sum_row_3); + + get_sum_8(v_dist, &v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + + v_sum_row = vqaddq_u16(v_sum_row_2, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul = vld1q_u16((const uint16_t *)neighbors[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + weight = vcombine_u16(vdup_n_u16(blk_fw[2]), vdup_n_u16(blk_fw[3])); + } else { + weight = vdupq_n_u16(bottom_weight); + } + } + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + u_sum_row = vqaddq_u16(u_sum_row, u_sum_row_3); + + v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + v_sum_row = vqaddq_u16(v_sum_row, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = 
average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul = vld1q_u16((const uint16_t *)neighbors[0]); + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2); + v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); +} + +// Perform temporal filter for the chroma components. +static void apply_temporal_filter_chroma( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. 
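+    // Instead a single neighbor table spans the entire 8-wide chroma row,
+    // carrying the per-lane neighbor counts for both edges and the middle.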
+ assert(ss_x); + + if (ss_y) { + neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } else { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); +} + +void vp9_apply_temporal_filter_neon( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, 
+ int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + const int *blk_fw_ptr = blk_fw; + + uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 16) { + store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height, + ss_x, ss_y, strength, blk_fw_ptr, use_whole_blk, + y_accum, y_count, y_dist_ptr, u_dist_ptr, + v_dist_ptr); + + apply_temporal_filter_chroma(u_pre, v_pre, uv_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw_ptr, + use_whole_blk, u_accum, u_count, v_accum, + v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c index bd3812036..ef3423f8e 100644 --- a/vp9/encoder/vp9_aq_complexity.c +++ b/vp9/encoder/vp9_aq_complexity.c @@ -87,7 +87,7 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { &cpi->rc, cm->frame_type, cm->base_qindex, aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth); - // For AQ complexity mode, we dont allow Q0 in a segment if the base + // For AQ complexity mode, we don't allow Q0 in a segment if the base // Q is not 0. 
Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment // Q delta is sometimes applied without going back around the rd loop. // This could lead to an illegal combination of partition size and q. diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index a84c8b524..ca56d14aa 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -169,8 +169,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, vpx_write_bit(w, p->extra & 1); } else { // t >= TWO_TOKEN && t < EOB_TOKEN const struct vp9_token *const a = &vp9_coef_encodings[t]; - const int v = a->value; - const int n = a->len; + int v = a->value; + int n = a->len; const int e = p->extra; vpx_write(w, 1, context_tree[2]); vp9_write_tree(w, vp9_coef_con_tree, @@ -179,8 +179,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, if (t >= CATEGORY1_TOKEN) { const vp9_extra_bit *const b = &extra_bits[t]; const unsigned char *pb = b->prob; - int v = e >> 1; - int n = b->len; // number of bits in v, assumed nonzero + v = e >> 1; + n = b->len; // number of bits in v, assumed nonzero do { const int bb = (v >> --n) & 1; vpx_write(w, bb, *pb++); @@ -599,7 +599,6 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, for (t = 0; t < entropy_nodes_update; ++t) { vpx_prob newp = new_coef_probs[i][j][k][l][t]; vpx_prob *oldp = old_coef_probs[i][j][k][l] + t; - const vpx_prob upd = DIFF_UPDATE_PROB; int64_t s; int u = 0; if (t == PIVOT_NODE) @@ -968,13 +967,13 @@ static void encode_tiles_buffer_alloc(VP9_COMP *const cpi) { int i; const size_t worker_data_size = cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data); - CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data, + CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data, vpx_memalign(16, worker_data_size)); memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size); for (i = 1; i < cpi->num_workers; ++i) { cpi->vp9_bitstream_worker_data[i].dest_size = cpi->oxcf.width * cpi->oxcf.height; - CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data[i].dest, + CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data[i].dest, vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size)); } } diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 20294b4b9..7fa00cd19 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -13,6 +13,7 @@ #include "vpx_util/vpx_thread.h" +#include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" @@ -24,7 +25,7 @@ typedef struct { unsigned int sse; int sum; unsigned int var; -} diff; +} Diff; struct macroblock_plane { DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]); @@ -33,8 +34,8 @@ struct macroblock_plane { uint16_t *eobs; struct buf_2d src; - // Quantizer setings - DECLARE_ALIGNED(16, int16_t, round_fp[8]); + // Quantizer settings + int16_t *round_fp; int16_t *quant_fp; int16_t *quant; int16_t *quant_shift; @@ -78,16 +79,16 @@ struct macroblock { int skip_recode; int skip_optimize; int q_index; - int block_qcoeff_opt; + double log_block_src_var; int block_tx_domain; // The equivalent error at the current rdmult of one whole bit (not one // bitcost unit). int errorperbit; - // The equivalend SAD error of one (whole) bit at the current quantizer + // The equivalent SAD error of one (whole) bit at the current quantizer // for large blocks. 
int sadperbit16; - // The equivalend SAD error of one (whole) bit at the current quantizer + // The equivalent SAD error of one (whole) bit at the current quantizer // for sub-8x8 blocks. int sadperbit4; int rddiv; @@ -127,7 +128,7 @@ struct macroblock { // from extending outside the UMV borders MvLimits mv_limits; - // Notes transform blocks where no coefficents are coded. + // Notes transform blocks where no coefficients are coded. // Set during mode selection. Read during block encoding. uint8_t zcoeff_blk[TX_SIZES][256]; diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c index b74b9027c..42073f756 100644 --- a/vp9/encoder/vp9_context_tree.c +++ b/vp9/encoder/vp9_context_tree.c @@ -25,16 +25,17 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, int i, k; ctx->num_4x4_blk = num_blk; - CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, vpx_calloc(num_blk, sizeof(uint8_t))); + CHECK_MEM_ERROR(&cm->error, ctx->zcoeff_blk, + vpx_calloc(num_blk, sizeof(uint8_t))); for (i = 0; i < MAX_MB_PLANE; ++i) { for (k = 0; k < 3; ++k) { - CHECK_MEM_ERROR(cm, ctx->coeff[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->coeff[i][k], vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k]))); - CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->qcoeff[i][k], vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k]))); - CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->dqcoeff[i][k], vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k]))); - CHECK_MEM_ERROR(cm, ctx->eobs[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->eobs[i][k], vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k]))); ctx->coeff_pbuf[i][k] = ctx->coeff[i][k]; ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k]; @@ -100,10 +101,10 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) { int nodes; vpx_free(td->leaf_tree); - CHECK_MEM_ERROR(cm, td->leaf_tree, + CHECK_MEM_ERROR(&cm->error, td->leaf_tree, vpx_calloc(leaf_nodes, sizeof(*td->leaf_tree))); vpx_free(td->pc_tree); - CHECK_MEM_ERROR(cm, td->pc_tree, + CHECK_MEM_ERROR(&cm->error, td->pc_tree, vpx_calloc(tree_nodes, sizeof(*td->pc_tree))); this_pc = &td->pc_tree[0]; diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 77d72396a..e5dffa90a 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -319,7 +319,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; - set_ref_ptrs(cm, filter_mbd, saved_frame, NONE); + set_ref_ptrs(cm, filter_mbd, saved_frame, NO_REF_FRAME); vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs); // Restore everything to its original state @@ -387,7 +387,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], consec_zeromv); // No need to keep checking 8x8 blocks if any of the sub-blocks // has small consec_zeromv (since threshold for no_skin based on - // zero/small motion in skin detection is high, i.e, > 4). + // zero/small motion in skin detection is high, i.e., > 4). if (consec_zeromv < 4) { i = ymis; break; @@ -634,11 +634,11 @@ int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES; init_num_ref_frames = use_svc ? 
MAX_REF_FRAMES : NONSVC_REF_FRAMES; denoiser->num_layers = num_layers; - CHECK_MEM_ERROR(cm, denoiser->running_avg_y, + CHECK_MEM_ERROR(&cm->error, denoiser->running_avg_y, vpx_calloc(denoiser->num_ref_frames * num_layers, sizeof(denoiser->running_avg_y[0]))); CHECK_MEM_ERROR( - cm, denoiser->mc_running_avg_y, + &cm->error, denoiser->mc_running_avg_y, vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0]))); for (layer = 0; layer < num_layers; ++layer) { diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 1483ac069..7ff5f00ed 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -349,17 +349,17 @@ typedef struct { int32_t sum_error; int log2_count; int variance; -} var; +} Var; typedef struct { - var none; - var horz[2]; - var vert[2]; + Var none; + Var horz[2]; + Var vert[2]; } partition_variance; typedef struct { partition_variance part_variances; - var split[4]; + Var split[4]; } v4x4; typedef struct { @@ -384,7 +384,7 @@ typedef struct { typedef struct { partition_variance *part_variances; - var *split[4]; + Var *split[4]; } variance_node; typedef enum { @@ -436,13 +436,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { } // Set variance values given sum square error, sum error, count. -static void fill_variance(uint32_t s2, int32_t s, int c, var *v) { +static void fill_variance(uint32_t s2, int32_t s, int c, Var *v) { v->sum_square_error = s2; v->sum_error = s; v->log2_count = c; } -static void get_variance(var *v) { +static void get_variance(Var *v) { v->variance = (int)(256 * (v->sum_square_error - (uint32_t)(((int64_t)v->sum_error * v->sum_error) >> @@ -450,7 +450,7 @@ static void get_variance(var *v) { v->log2_count); } -static void sum_2_variances(const var *a, const var *b, var *r) { +static void sum_2_variances(const Var *a, const Var *b, Var *r) { assert(a->log2_count == b->log2_count); fill_variance(a->sum_square_error + b->sum_square_error, a->sum_error + b->sum_error, a->log2_count + 1, r); @@ -1301,6 +1301,13 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, (frame_is_intra_only(cm) || (is_one_pass_svc(cpi) && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); + + if (!is_key_frame) { + if (cm->frame_refs[LAST_FRAME - 1].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[LAST_FRAME - 1].sf.y_scale_fp == REF_INVALID_SCALE) + is_key_frame = 1; + } + // Always use 4x4 partition for key frame. const int use_4x4_partition = frame_is_intra_only(cm); const int low_res = (cm->width <= 352 && cm->height <= 288); @@ -1437,7 +1444,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, &cm->frame_refs[LAST_FRAME - 1].sf); mi->ref_frame[0] = LAST_FRAME; } - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->sb_type = BLOCK_64X64; mi->mv[0].as_int = 0; mi->interp_filter = BILINEAR; @@ -1545,7 +1552,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } if (low_res && threshold_4x4avg < INT64_MAX) - CHECK_MEM_ERROR(cm, vt2, vpx_calloc(16, sizeof(*vt2))); + CHECK_MEM_ERROR(&cm->error, vt2, vpx_calloc(16, sizeof(*vt2))); // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances // for splits. 
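An aside on the Var plumbing in the hunk above: get_variance() computes a population variance scaled by 256, and the sample count is carried as a power of two (log2_count) so that both divisions reduce to right shifts. A minimal standalone rendering of the same arithmetic, with a worked example (helper name hypothetical, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    /* 256 * (E[x^2] - E[x]^2) over n = 2^log2_count samples, using the
     * same shift-based form as get_variance(). */
    static int scaled_variance(uint32_t sum_square_error, int32_t sum_error,
                               int log2_count) {
      const uint32_t mean_sq =
          (uint32_t)(((int64_t)sum_error * sum_error) >> log2_count);
      return (int)((256 * (sum_square_error - mean_sq)) >> log2_count);
    }

    int main(void) {
      /* Four samples {1, 1, 3, 3}: sum = 8, sum of squares = 20.
       * True variance is 1, so the scaled result is 256. */
      assert(scaled_variance(20, 8, 2) == 256);
      return 0;
    }

This power-of-two bookkeeping is also why sum_2_variances() asserts that its two operands have equal log2_count: merging two equal-sized accumulators just adds the raw sums and increments the exponent.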
for (i = 0; i < 4; i++) { @@ -1706,7 +1713,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, const int y16_idx = ((j >> 1) << 1); // For inter frames: if variance4x4downsample[] == 1 for this 16x16 // block, then the variance is based on 4x4 down-sampling, so use vt2 - // in set_vt_partioning(), otherwise use vt. + // in set_vt_partitioning(), otherwise use vt. v16x16 *vtemp = (!is_key_frame && variance4x4downsample[i2 + j] == 1) ? &vt2[i2 + j] : &vt.split[i].split[j]; @@ -1863,8 +1870,8 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, vp9_update_mv_count(td); if (cm->interp_filter == SWITCHABLE) { - const int ctx = get_pred_context_switchable_interp(xd); - ++td->counts->switchable_interp[ctx][xdmi->interp_filter]; + const int ctx_interp = get_pred_context_switchable_interp(xd); + ++td->counts->switchable_interp[ctx_interp][xdmi->interp_filter]; } } @@ -1924,7 +1931,7 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, mi->skip = 1; mi->uv_mode = DC_PRED; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->mv[0].as_int = 0; mi->interp_filter = filter_ref; @@ -1980,6 +1987,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t best_rd = INT64_MAX; vpx_clear_system_state(); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_sb_modes_time); +#endif // Use the lower precision, but faster, 32x32 fdct for mode selection. x->use_lp32x32fdct = 1; @@ -2018,20 +2028,20 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, // Save rdmult before it might be changed, so it can be restored later. orig_rdmult = x->rdmult; - if ((cpi->sf.tx_domain_thresh > 0.0) || (cpi->sf.quant_opt_thresh > 0.0)) { + if ((cpi->sf.tx_domain_thresh > 0.0) || + (cpi->sf.trellis_opt_tx_rd.thresh > 0.0)) { double logvar = vp9_log_block_var(cpi, x, bsize); - // Check block complexity as part of descision on using pixel or transform + // Check block complexity as part of decision on using pixel or transform // domain distortion in rd tests. x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion && (logvar >= cpi->sf.tx_domain_thresh); - // Check block complexity as part of descision on using quantized - // coefficient optimisation inside the rd loop. - x->block_qcoeff_opt = - cpi->sf.allow_quant_coeff_opt && (logvar <= cpi->sf.quant_opt_thresh); + // Store block complexity to decide on using quantized coefficient + // optimization inside the rd loop. 
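One note on the change just above: the precomputed trellis decision (block_qcoeff_opt, gated by quant_opt_thresh) is replaced by storing the block's log source variance, so the quantized-coefficient optimization choice can be made later at the point of use; the pixel-versus-transform-domain distortion choice is still taken here. That remaining gate reduces to a shape like this sketch (function name hypothetical; the threshold comes from the speed features):

    #include <stdbool.h>

    /* Complex (high-variance) blocks can tolerate the cheaper
     * transform-domain distortion measure in rd tests. */
    static bool use_tx_domain_distortion(bool allow_tx_domain, double logvar,
                                         double tx_domain_thresh) {
      return allow_tx_domain && logvar >= tx_domain_thresh;
    }
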
+ x->log_block_src_var = logvar; } else { x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion; - x->block_qcoeff_opt = cpi->sf.allow_quant_coeff_opt; + x->log_block_src_var = 0.0; } set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); @@ -2047,15 +2057,27 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd); } else { if (bsize >= BLOCK_8X8) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rd_pick_inter_mode_sb_time); +#endif if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize, ctx, best_rd); else vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rd_pick_inter_mode_sb_time); +#endif } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time); +#endif vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time); +#endif } } @@ -2078,6 +2100,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, ctx->rate = rd_cost->rate; ctx->dist = rd_cost->dist; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_sb_modes_time); +#endif } #endif // !CONFIG_REALTIME_ONLY @@ -2414,16 +2439,16 @@ static void set_source_var_based_partition(VP9_COMP *cpi, (row8x8_remaining >= MI_BLOCK_SIZE)) { int i, j; int index; - diff d32[4]; + Diff d32[4]; const int offset = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1); int is_larger_better = 0; int use32x32 = 0; unsigned int thr = cpi->source_var_thresh; - memset(d32, 0, 4 * sizeof(diff)); + memset(d32, 0, sizeof(d32)); for (i = 0; i < 4; i++) { - diff *d16[4]; + Diff *d16[4]; for (j = 0; j < 4; j++) { int b_mi_row = coord_lookup[i * 4 + j].row; @@ -2730,10 +2755,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) { RD_COST tmp_rdc; - PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; + PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0]; vp9_rd_cost_init(&tmp_rdc); - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx); pc_tree->horizontal[1].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col, &tmp_rdc, subsize, &pc_tree->horizontal[1], INT_MAX, @@ -2754,10 +2779,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) { RD_COST tmp_rdc; - PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0]; + PICK_MODE_CONTEXT *vctx = &pc_tree->vertical[0]; vp9_rd_cost_init(&tmp_rdc); - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + update_state(cpi, td, vctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, vctx); pc_tree->vertical[bsize > BLOCK_8X8].skip_ref_frame_mask = 0; rd_pick_sb_modes( cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), &tmp_rdc, @@ -2829,8 +2854,6 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, int x_idx = (i & 1) * 
(mi_step >> 1); int y_idx = (i >> 1) * (mi_step >> 1); RD_COST tmp_rdc; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; - PARTITION_CONTEXT sl[8], sa[8]; if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; @@ -3036,14 +3059,12 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row, min_size = BLOCK_64X64; max_size = BLOCK_4X4; - if (prev_mi) { - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - mi = prev_mi[idy * cm->mi_stride + idx]; - bs = mi ? mi->sb_type : bsize; - min_size = VPXMIN(min_size, bs); - max_size = VPXMAX(max_size, bs); - } + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + mi = prev_mi[idy * cm->mi_stride + idx]; + bs = mi ? mi->sb_type : bsize; + min_size = VPXMIN(min_size, bs); + max_size = VPXMAX(max_size, bs); } } @@ -3189,7 +3210,7 @@ static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, left_par = 1; } - if (prev_mi) { + if (prev_mi[0]) { context_size = prev_mi[0]->sb_type; if (context_size < bsize) last_par = 2; @@ -3422,18 +3443,23 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, MV ref_mv_full = { ref_mv.row >> 3, ref_mv.col >> 3 }; MV best_mv = { 0, 0 }; int cost_list[5]; + struct buf_2d backup_pre[MAX_MB_PLANE] = { { 0, 0 } }; - if (scaled_ref_frame) + if (scaled_ref_frame) { yv12 = scaled_ref_frame; - else + // As reported in b/311294795, the reference buffer pointer needs to be + // saved and restored after the search. Otherwise, it causes problems while + // the reference frame scaling happens. + for (int i = 0; i < MAX_MB_PLANE; i++) backup_pre[i] = xd->plane[i].pre[0]; + } else { yv12 = get_ref_frame_buffer(cpi, ref); + } assert(yv12 != NULL); if (!yv12) return; - vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, - &cm->frame_refs[ref - 1].sf); + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, NULL); mi->ref_frame[0] = ref; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->sb_type = bsize; vp9_set_mv_search_range(&x->mv_limits, &ref_mv); vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, search_method, @@ -3444,6 +3470,11 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, x->mv_limits = tmp_mv_limits; mi->mv[0].as_mv = best_mv; + // Restore reference buffer pointer. + if (scaled_ref_frame) { + for (int i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_pre[i]; + } + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); xd->plane[0].dst.buf = pred_buf; xd->plane[0].dst.stride = 64; @@ -3454,15 +3485,15 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, // Features used: QP; spatial block size contexts; variance of prediction // residue after simple_motion_search. #define FEATURES 12 -static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi, - MACROBLOCK *const x, - PC_TREE *const pc_tree, - BLOCK_SIZE bsize, int mi_row, - int mi_col, int *none, int *split) { +static void ml_predict_var_rd_partitioning(const VP9_COMP *const cpi, + MACROBLOCK *const x, + PC_TREE *const pc_tree, + BLOCK_SIZE bsize, int mi_row, + int mi_col, int *none, int *split) { const VP9_COMMON *const cm = &cpi->common; const NN_CONFIG *nn_config = NULL; + const MACROBLOCKD *const xd = &x->e_mbd; #if CONFIG_VP9_HIGHBITDEPTH - MACROBLOCKD *xd = &x->e_mbd; DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64 * 2]); uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
(CONVERT_TO_BYTEPTR(pred_buffer)) @@ -3545,7 +3576,6 @@ static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi, const unsigned int var = cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); - const MACROBLOCKD *const xd = &x->e_mbd; const int has_above = !!xd->above_mi; const int has_left = !!xd->left_mi; const BLOCK_SIZE above_bsize = has_above ? xd->above_mi->sb_type : bsize; @@ -3695,7 +3725,6 @@ static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int row, col; int dr = 0; - int count = 0; double r0, rk, beta; TplDepFrame *tpl_frame; @@ -3719,8 +3748,6 @@ static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, intra_cost += this_stats->intra_cost; mc_dep_cost += this_stats->mc_dep_cost; - - ++count; } } @@ -3777,7 +3804,7 @@ static void assign_motion_vector_info(const int block_width_4x4, const int col_4x4 = col_start_4x4 + j; const int unit_index = row_4x4 * num_unit_cols + col_4x4; if (row_4x4 >= num_unit_rows || col_4x4 >= num_unit_cols) continue; - if (source_ref_frame[1] == NONE) { + if (source_ref_frame[1] == NO_REF_FRAME) { assert(source_mv[1]->row == 0 && source_mv[1]->col == 0); } motion_vector_info[unit_index].ref_frame[0] = source_ref_frame[0]; @@ -4080,8 +4107,8 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows && mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols; if (do_rd_ml_partition_var_pruning) { - ml_predict_var_rd_paritioning(cpi, x, pc_tree, bsize, mi_row, mi_col, - &partition_none_allowed, &do_split); + ml_predict_var_rd_partitioning(cpi, x, pc_tree, bsize, mi_row, mi_col, + &partition_none_allowed, &do_split); } else { vp9_zero(pc_tree->mv); } @@ -4330,9 +4357,9 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows && bsize > BLOCK_8X8) { - PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0]; + update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[1].pred_interp_filter = pred_interp_filter; @@ -4407,12 +4434,31 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } + if (bsize == BLOCK_64X64 && best_rdc.rdcost == INT64_MAX) { + vp9_rd_cost_reset(&this_rdc); + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, BLOCK_64X64, + ctx, INT_MAX, INT64_MAX); + ctx->rdcost = this_rdc.rdcost; + vp9_rd_cost_update(partition_mul, x->rddiv, &this_rdc); + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + should_encode_sb = 1; + pc_tree->partitioning = PARTITION_NONE; + } + } + *rd_cost = best_rdc; if (should_encode_sb && pc_tree->index != 3) { int output_enabled = (bsize == BLOCK_64X64); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_sb_time); +#endif encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_sb_time); +#endif #if CONFIG_RATE_CTRL if (oxcf->use_simple_encode_api) { // Store partition, motion vector of the superblock. 
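Between these hunks, a remark on the new BLOCK_64X64 fallback above: if every partition candidate was pruned or rejected, best_rdc.rdcost is still INT64_MAX and the superblock would be left with no coded mode, so one PARTITION_NONE evaluation is forced with unbounded thresholds (INT_MAX / INT64_MAX). The trigger reduces to a guard like this sketch (names simplified and hypothetical):

    #include <stdint.h>

    #define RD_INVALID INT64_MAX /* "no valid mode found" sentinel */

    static int need_whole_block_fallback(int is_64x64_root,
                                         int64_t best_rdcost) {
      return is_64x64_root && best_rdcost == RD_INVALID;
    }

This pairs with the earlier choose_partitioning change that bails to the key-frame path when a reference's scale factors are REF_INVALID_SCALE: once references can be dropped, all inter candidates may legitimately fail, yet the superblock must still be encoded.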
@@ -4539,8 +4585,15 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, &x->min_partition_size, &x->max_partition_size); } td->pc_root->none.rdcost = 0; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_partition_time); +#endif rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rdc, dummy_rdc, td->pc_root); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_partition_time); +#endif } (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile, num_sb_cols); @@ -4672,6 +4725,8 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); + x->skip_recode = 0; + mi = xd->mi[0]; mi->sb_type = bsize; @@ -4795,9 +4850,9 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) { #define FEATURES 6 #define LABELS 2 -static int ml_predict_var_paritioning(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int mi_row, - int mi_col) { +static int ml_predict_var_partitioning(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col) { VP9_COMMON *const cm = &cpi->common; const NN_CONFIG *nn_config = NULL; @@ -4929,7 +4984,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (partition_none_allowed || do_split) do_rect = 0; if (partition_none_allowed && do_split) { const int ml_predicted_partition = - ml_predict_var_paritioning(cpi, x, bsize, mi_row, mi_col); + ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col); if (ml_predicted_partition == PARTITION_NONE) do_split = 0; if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0; } @@ -5418,7 +5473,7 @@ static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile, &cm->frame_refs[LAST_FRAME - 1].sf); mi->ref_frame[0] = LAST_FRAME; } - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->sb_type = BLOCK_64X64; mi->mv[0].as_int = 0; mi->interp_filter = BILINEAR; @@ -5608,7 +5663,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad && cpi->oxcf.speed < 6 && !frame_is_intra_only(cm) && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { - // Use lower max_partition_size for low resoultions. + // Use lower max_partition_size for low resolutions. if (cm->width <= 352 && cm->height <= 288) x->max_partition_size = BLOCK_32X32; else @@ -5650,12 +5705,12 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } // end RTC play code -static INLINE uint32_t variance(const diff *const d) { +static INLINE uint32_t variance(const Diff *const d) { return d->sse - (uint32_t)(((int64_t)d->sum * d->sum) >> 8); } #if CONFIG_VP9_HIGHBITDEPTH -static INLINE uint32_t variance_highbd(diff *const d) { +static INLINE uint32_t variance_highbd(Diff *const d) { const int64_t var = (int64_t)d->sse - (((int64_t)d->sum * d->sum) >> 8); return (var >= 0) ? (uint32_t)var : 0; } @@ -5675,7 +5730,7 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { ? 
(cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100) : (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100); DECLARE_ALIGNED(16, int, hist[VAR_HIST_BINS]); - diff *var16 = cpi->source_diff_var; + Diff *var16 = cpi->source_diff_var; int sum = 0; int i, j; @@ -5758,8 +5813,8 @@ static void source_var_based_partition_search_method(VP9_COMP *cpi) { if (cm->last_width != cm->width || cm->last_height != cm->height) { if (cpi->source_diff_var) vpx_free(cpi->source_diff_var); - CHECK_MEM_ERROR(cm, cpi->source_diff_var, - vpx_calloc(cm->MBs, sizeof(diff))); + CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var, + vpx_calloc(cm->MBs, sizeof(*cpi->source_diff_var))); } if (!cpi->frames_till_next_var_check) @@ -5798,7 +5853,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) { if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { if (cpi->tile_data != NULL) vpx_free(cpi->tile_data); CHECK_MEM_ERROR( - cm, cpi->tile_data, + &cm->error, cpi->tile_data, vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data))); cpi->allocated_tiles = tile_cols * tile_rows; @@ -5807,20 +5862,15 @@ TileDataEnc *tile_data = &cpi->tile_data[tile_row * tile_cols + tile_col]; int i, j; + const MV zero_mv = { 0, 0 }; for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; -#if CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; - } -#endif // CONFIG_RATE_CTRL -#if CONFIG_CONSISTENT_RECODE tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; -#endif // CONFIG_CONSISTENT_RECODE tile_data->mode_map[i][j] = j; } } + tile_data->firstpass_top_mv = zero_mv; #if CONFIG_MULTITHREAD tile_data->row_base_thresh_freq_fact = NULL; #endif @@ -6037,9 +6087,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH x->inv_txfm_add = xd->lossless ?
vp9_iwht4x4_add : vp9_idct4x4_add; -#if CONFIG_CONSISTENT_RECODE x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1; -#endif if (xd->lossless) x->optimize = 0; x->sharpness = cpi->oxcf.sharpness; x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ); @@ -6108,6 +6156,15 @@ static void encode_frame_internal(VP9_COMP *cpi) { cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base; } + for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; + ++ref_frame) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { + if (cm->frame_refs[ref_frame - 1].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[ref_frame - 1].sf.y_scale_fp == REF_INVALID_SCALE) + cpi->ref_frame_flags &= ~ref_frame_to_flag(ref_frame); + } + } + // Frame segmentation if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ) build_kmeans_segmentation(cpi); @@ -6166,7 +6223,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { int mi_row, mi_col; int sum_delta = 0; - int map_index = 0; int qdelta_index; int segment_id; @@ -6176,7 +6232,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { segment_id = mi_8x8[0]->segment_id; qdelta_index = get_segdata(seg, segment_id, SEG_LVL_ALT_Q); sum_delta += qdelta_index; - map_index++; } mi_8x8_ptr += cm->mi_stride; } @@ -6184,13 +6239,11 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { return sum_delta / (cm->mi_rows * cm->mi_cols); } -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL static void restore_encode_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - int tile_col, tile_row; + int tile_idx; int i, j; + TileDataEnc *tile_data; RD_OPT *rd_opt = &cpi->rd; for (i = 0; i < MAX_REF_FRAMES; i++) { for (j = 0; j < REFERENCE_MODES; j++) @@ -6201,35 +6254,19 @@ static void restore_encode_params(VP9_COMP *cpi) { rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j]; } - if (cpi->tile_data != NULL) { - for (tile_row = 0; tile_row < tile_rows; ++tile_row) - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileDataEnc *tile_data = - &cpi->tile_data[tile_row * tile_cols + tile_col]; - for (i = 0; i < BLOCK_SIZES; ++i) { - for (j = 0; j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact[i][j] = - tile_data->thresh_freq_fact_prev[i][j]; - } - } - } + for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) { + assert(cpi->tile_data); + tile_data = &cpi->tile_data[tile_idx]; + vp9_copy(tile_data->thresh_freq_fact, tile_data->thresh_freq_fact_prev); } cm->interp_filter = cpi->sf.default_interp_filter; } -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; -#if CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - restore_encode_params(cpi); - } -#endif // CONFIG_RATE_CTRL -#if CONFIG_CONSISTENT_RECODE restore_encode_params(cpi); -#endif #if CONFIG_MISMATCH_DEBUG mismatch_reset_frame(MAX_MB_PLANE); @@ -6283,7 +6320,13 @@ void vp9_encode_frame(VP9_COMP *cpi) { if (cm->interp_filter == SWITCHABLE) cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_frame_internal_time); +#endif encode_frame_internal(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_frame_internal_time); +#endif for (i = 0; i < REFERENCE_MODES; ++i) mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2; diff --git 
a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index fa222f9dc..eded9f5c4 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -26,6 +26,7 @@ #include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_tokenize.h" @@ -78,7 +79,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, const int shift = (tx_size == TX_32X32); const int16_t *const dequant_ptr = pd->dequant; const uint8_t *const band_translate = get_band_translate(tx_size); - const scan_order *const so = get_scan(xd, tx_size, plane_type, block); + const ScanOrder *const so = get_scan(xd, tx_size, plane_type, block); const int16_t *const scan = so->scan; const int16_t *const nb = so->neighbors; const MODE_INFO *mbmi = xd->mi[0]; @@ -350,7 +351,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); @@ -366,28 +367,24 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, + pd->dequant, eob, scan_order); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } return; @@ -397,26 +394,25 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, scan_order->iscan); + vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: 
vpx_fdct8x8(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, scan_order->iscan); + vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, scan_order->iscan); + vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } } @@ -495,7 +491,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); @@ -511,28 +507,24 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } return; @@ -542,28 +534,24 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_fdct8x8(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + 
vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } } @@ -759,10 +747,23 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, MODE_INFO *mi = xd->mi[0]; int plane; #if CONFIG_MISMATCH_DEBUG - struct encode_b_args arg = { x, 1, NULL, NULL, + struct encode_b_args arg = { x, + 1, // enable_trellis_opt + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + NULL, // above entropy context + NULL, // left entropy context &mi->skip, mi_row, mi_col, output_enabled }; #else - struct encode_b_args arg = { x, 1, NULL, NULL, &mi->skip }; + struct encode_b_args arg = { x, + 1, // enable_trellis_opt + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + NULL, // above entropy context + NULL, // left entropy context + &mi->skip }; (void)mi_row; (void)mi_col; (void)output_enabled; @@ -780,9 +781,9 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); - arg.enable_coeff_opt = 1; + arg.enable_trellis_opt = 1; } else { - arg.enable_coeff_opt = 0; + arg.enable_trellis_opt = 0; } arg.ta = ctx.ta[plane]; arg.tl = ctx.tl[plane]; @@ -804,7 +805,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const scan_order *scan_order; + const ScanOrder *scan_order; TX_TYPE tx_type = DCT_DCT; PREDICTION_MODE mode; const int bwl = b_width_log2_lookup[plane_bsize]; @@ -814,17 +815,13 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, uint16_t *eob = &p->eobs[block]; const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; + int enable_trellis_opt = !x->skip_recode; ENTROPY_CONTEXT *a = NULL; ENTROPY_CONTEXT *l = NULL; int entropy_ctx = 0; dst = &pd->dst.buf[4 * (row * dst_stride + col)]; src = &p->src.buf[4 * (row * src_stride + col)]; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; - if (args->enable_coeff_opt) { - a = &args->ta[col]; - l = &args->tl[row]; - entropy_ctx = combine_entropy_contexts(*a, *l); - } if (tx_size == TX_4X4) { tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block); @@ -848,20 +845,42 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, // skip block condition should be handled before this is called. 
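The quantizer call sites rewritten through this file share one theme: rather than unpacking macroblock_plane (zbin/round/quant/quant_shift) and the scan order (scan/iscan) into six loose pointer arguments, vpx_quantize_b() and friends now receive the two structs directly, so call sites shrink and sibling pointers cannot be transposed. Reduced to a toy (all names hypothetical):

    #include <stdint.h>

    typedef struct {
      const int16_t *zbin, *round, *quant, *quant_shift;
    } PlaneParamsToy;
    typedef struct {
      const int16_t *scan, *iscan;
    } ScanOrderToy;

    /* Old shape: six loose pointers, easy to mis-order at a call site. */
    static void quantize_v1(const int16_t *zbin, const int16_t *round,
                            const int16_t *quant, const int16_t *quant_shift,
                            const int16_t *scan, const int16_t *iscan) {
      (void)zbin; (void)round; (void)quant; (void)quant_shift;
      (void)scan; (void)iscan;
    }

    /* New shape: the owning structs travel as units. */
    static void quantize_v2(const PlaneParamsToy *p, const ScanOrderToy *so) {
      quantize_v1(p->zbin, p->round, p->quant, p->quant_shift,
                  so->scan, so->iscan);
    }
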
assert(!x->skip_block); + if (!x->skip_recode) { + const int tx_size_in_pixels = (1 << tx_size) << 2; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride, + xd->bd); + } else { + vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride); + } +#else + vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride); +#endif + enable_trellis_opt = do_trellis_opt(pd, src_diff, diff_stride, row, col, + plane_bsize, tx_size, args); + } + + if (enable_trellis_opt) { + a = &args->ta[col]; + l = &args->tl[row]; + entropy_ctx = combine_entropy_contexts(*a, *l); + } + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: if (!x->skip_recode) { - vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, + eob, scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -870,17 +889,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_16X16: if (!x->skip_recode) { - vpx_highbd_subtract_block(16, 16, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); if (tx_type == DCT_DCT) vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); else vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, + eob, scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -890,17 +906,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_8X8: if (!x->skip_recode) { - vpx_highbd_subtract_block(8, 8, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); if (tx_type == DCT_DCT) vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); else vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -911,17 +924,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, default: assert(tx_size == TX_4X4); if (!x->skip_recode) { - vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); if (tx_type != DCT_DCT) vp9_highbd_fht4x4(src_diff, coeff, 
diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -945,14 +955,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: if (!x->skip_recode) { - vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, - dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) @@ -960,14 +967,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_16X16: if (!x->skip_recode) { - vpx_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst, - dst_stride); vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) @@ -975,14 +979,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_8X8: if (!x->skip_recode) { - vpx_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst, - dst_stride); vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) @@ -991,17 +992,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, default: assert(tx_size == TX_4X4); if (!x->skip_recode) { - vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, - dst_stride); if (tx_type != DCT_DCT) vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -1019,28 +1017,43 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, } void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, - int 
enable_optimize_b) { + int enable_trellis_opt) { const MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; #if CONFIG_MISMATCH_DEBUG // TODO(angiebird): make mismatch_debug support intra mode struct encode_b_args arg = { - x, enable_optimize_b, ctx.ta[plane], ctx.tl[plane], &xd->mi[0]->skip, 0, 0, - 0 + x, + enable_trellis_opt, + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + ctx.ta[plane], + ctx.tl[plane], + &xd->mi[0]->skip, + 0, // mi_row + 0, // mi_col + 0 // output_enabled }; #else - struct encode_b_args arg = { x, enable_optimize_b, ctx.ta[plane], - ctx.tl[plane], &xd->mi[0]->skip }; + struct encode_b_args arg = { x, + enable_trellis_opt, + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + ctx.ta[plane], + ctx.tl[plane], + &xd->mi[0]->skip }; #endif - if (enable_optimize_b && x->optimize && + if (enable_trellis_opt && x->optimize && (!x->skip_recode || !x->skip_optimize)) { const struct macroblockd_plane *const pd = &xd->plane[plane]; const TX_SIZE tx_size = plane ? get_uv_tx_size(xd->mi[0], pd) : xd->mi[0]->tx_size; vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); } else { - arg.enable_coeff_opt = 0; + arg.enable_trellis_opt = 0; } vp9_foreach_transformed_block_in_plane(xd, bsize, plane, diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index 1975ee73a..1391446be 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -20,7 +20,10 @@ extern "C" { struct encode_b_args { MACROBLOCK *x; - int enable_coeff_opt; + int enable_trellis_opt; + double trellis_opt_thresh; + int *sse_calc_done; + int64_t *sse; ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; int8_t *skip; @@ -48,7 +51,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, - int enable_optimize_b); + int enable_trellis_opt); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index e38507754..152d42bc9 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -12,6 +12,7 @@ #include <math.h> #include <stdio.h> #include <stdlib.h> +#include <string.h> #include "./vp9_rtcd.h" #include "./vpx_config.h" @@ -23,6 +24,7 @@ #if CONFIG_INTERNAL_STATS #include "vpx_dsp/ssim.h" #endif +#include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" #include "vpx_ports/system_state.h" #include "vpx_ports/vpx_once.h" @@ -32,18 +34,15 @@ #endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_idct.h" -#if CONFIG_NON_GREEDY_MV -#include "vp9/common/vp9_mvref_common.h" -#endif #if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_tile_common.h" -#include "vp9/common/vp9_scan.h" #if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_alt_ref_aq.h" @@ -81,8 +80,11 @@ #include "vp9/encoder/vp9_speed_features.h" #include "vp9/encoder/vp9_svc_layercontext.h" #include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_tpl_model.h" #include "vp9/vp9_cx_iface.h" +#include "vpx/vpx_ext_ratectrl.h" + #define AM_SEGMENT_ID_INACTIVE 7 #define AM_SEGMENT_ID_ACTIVE 0 @@ -126,13 +128,6 @@ static int is_spatial_denoise_enabled(VP9_COMP *cpi) { } #endif -#if CONFIG_VP9_HIGHBITDEPTH -void 
highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size); -#endif -void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size); - #if !CONFIG_REALTIME_ONLY // compute adaptive threshold for skip recoding static int compute_context_model_thresh(const VP9_COMP *const cpi) { @@ -148,7 +143,7 @@ static int compute_context_model_thresh(const VP9_COMP *const cpi) { // frame context probability model is less than a certain threshold. // The first component is the most critical part to guarantee adaptivity. // Other parameters are estimated based on normal setting of hd resolution - // parameters. e.g frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50 + // parameters. e.g. frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50 const int thresh = ((FRAME_SIZE_FACTOR * frame_size - FRAME_RATE_FACTOR * bitrate) * qindex_factor) >> @@ -502,22 +497,22 @@ static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { "Too many reference buffers are used." }; -static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { +static INLINE void Scale2Ratio(VPX_SCALING_MODE mode, int *hr, int *hs) { switch (mode) { - case NORMAL: + case VP8E_NORMAL: *hr = 1; *hs = 1; break; - case FOURFIVE: + case VP8E_FOURFIVE: *hr = 4; *hs = 5; break; - case THREEFIVE: + case VP8E_THREEFIVE: *hr = 3; *hs = 5; break; default: - assert(mode == ONETWO); + assert(mode == VP8E_ONETWO); *hr = 1; *hs = 2; break; @@ -690,9 +685,10 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level; } -int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, - unsigned int cols, int delta_q[8], int delta_lf[8], - int skip[8], int ref_frame[8]) { +vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, + unsigned int rows, unsigned int cols, + int delta_q[8], int delta_lf[8], int skip[8], + int ref_frame[8]) { VP9_COMMON *cm = &cpi->common; vpx_roi_map_t *roi = &cpi->roi; const int range = 63; @@ -703,13 +699,13 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, // Check number of rows and columns match if (frame_rows != (int)rows || frame_cols != (int)cols) { - return -1; + return VPX_CODEC_INVALID_PARAM; } if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) || !check_seg_range(ref_frame, ref_frame_range) || !check_seg_range(skip, skip_range)) - return -1; + return VPX_CODEC_INVALID_PARAM; // Also disable segmentation if no deltas are specified. if (!map || @@ -723,14 +719,15 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, ref_frame[6] == -1 && ref_frame[7] == -1))) { vp9_disable_segmentation(&cm->seg); cpi->roi.enabled = 0; - return 0; + return VPX_CODEC_OK; } if (roi->roi_map) { vpx_free(roi->roi_map); roi->roi_map = NULL; } - CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols)); + roi->roi_map = vpx_malloc(rows * cols); + if (!roi->roi_map) return VPX_CODEC_MEM_ERROR; // Copy to ROI structure in the compressor. 
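One observation on the vp9_set_roi_map() conversion above: failures now surface as vpx_codec_err_t values instead of a bare -1, and the map allocation skips CHECK_MEM_ERROR (which reports through the shared error context), presumably so an out-of-memory condition is returned to the caller as VPX_CODEC_MEM_ERROR rather than raised internally. A sketch of the matching caller-side handling, built only on the public error codes (wrapper name hypothetical):

    #include "vpx/vpx_codec.h" /* vpx_codec_err_t, VPX_CODEC_* values */

    static const char *roi_status(vpx_codec_err_t err) {
      switch (err) {
        case VPX_CODEC_OK: return "ROI map applied";
        case VPX_CODEC_INVALID_PARAM:
          return "ROI map rejected: dimensions or deltas out of range";
        case VPX_CODEC_MEM_ERROR:
          return "ROI map rejected: allocation failed";
        default: return "ROI map rejected: unexpected error";
      }
    }
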
memcpy(roi->roi_map, map, rows * cols); @@ -742,7 +739,7 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, roi->rows = rows; roi->cols = cols; - return 0; + return VPX_CODEC_OK; } int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, @@ -886,10 +883,11 @@ static int vp9_enc_alloc_mi(VP9_COMMON *cm, int mi_size) { if (!cm->prev_mip) return 1; cm->mi_alloc_size = mi_size; - cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *)); + cm->mi_grid_base = + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); if (!cm->mi_grid_base) return 1; cm->prev_mi_grid_base = - (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *)); + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); if (!cm->prev_mi_grid_base) return 1; return 0; @@ -1383,7 +1381,7 @@ static void alloc_context_buffers_ext(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; int mi_size = cm->mi_cols * cm->mi_rows; - CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base, + CHECK_MEM_ERROR(&cm->error, cpi->mbmi_ext_base, vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base))); } @@ -1402,14 +1400,14 @@ static void alloc_compressor_data(VP9_COMP *cpi) { { unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols); - CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0], + CHECK_MEM_ERROR(&cm->error, cpi->tile_tok[0][0], vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0]))); } sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; vpx_free(cpi->tplist[0][0]); CHECK_MEM_ERROR( - cm, cpi->tplist[0][0], + &cm->error, cpi->tplist[0][0], vpx_calloc(sb_rows * 4 * (1 << 6), sizeof(*cpi->tplist[0][0]))); vp9_setup_pc_tree(&cpi->common, &cpi->td); @@ -1571,13 +1569,15 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { } #if CONFIG_VP9_HIGHBITDEPTH -#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; +#define HIGHBD_BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; #define MAKE_BFP_SAD_WRAPPER(fnname) \ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ @@ -1637,284 +1637,361 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { } MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x64x4d) + 
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x64x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x4x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x4x4d) static void highbd_set_var_fns(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; if (cm->use_highbitdepth) { switch (cm->bit_depth) { case VPX_BITS_8: - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8, - vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16, - vpx_highbd_8_sub_pixel_variance32x16, - vpx_highbd_8_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits8) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8, - vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32, - vpx_highbd_8_sub_pixel_variance16x32, - vpx_highbd_8_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits8) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8, - vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32, - vpx_highbd_8_sub_pixel_variance64x32, - vpx_highbd_8_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits8) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8, - vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64, - vpx_highbd_8_sub_pixel_variance32x64, - vpx_highbd_8_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits8) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8, - vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32, - vpx_highbd_8_sub_pixel_variance32x32, - vpx_highbd_8_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits8) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8, - vpx_highbd_sad64x64_avg_bits8, 
vpx_highbd_8_variance64x64, - vpx_highbd_8_sub_pixel_variance64x64, - vpx_highbd_8_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits8) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8, - vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16, - vpx_highbd_8_sub_pixel_variance16x16, - vpx_highbd_8_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits8) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8, - vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8, - vpx_highbd_8_sub_pixel_variance16x8, - vpx_highbd_8_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits8) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8, - vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16, - vpx_highbd_8_sub_pixel_variance8x16, - vpx_highbd_8_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits8) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits8, + vpx_highbd_sad_skip_32x16_bits8, vpx_highbd_sad32x16_avg_bits8, + vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16, + vpx_highbd_8_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits8, vpx_highbd_sad_skip_32x16x4d_bits8) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits8, + vpx_highbd_sad_skip_16x32_bits8, vpx_highbd_sad16x32_avg_bits8, + vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32, + vpx_highbd_8_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits8, vpx_highbd_sad_skip_16x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits8, + vpx_highbd_sad_skip_64x32_bits8, vpx_highbd_sad64x32_avg_bits8, + vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32, + vpx_highbd_8_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits8, vpx_highbd_sad_skip_64x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits8, + vpx_highbd_sad_skip_32x64_bits8, vpx_highbd_sad32x64_avg_bits8, + vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64, + vpx_highbd_8_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits8, vpx_highbd_sad_skip_32x64x4d_bits8) HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8, - vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, - vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x4d_bits8) + BLOCK_32X32, vpx_highbd_sad32x32_bits8, + vpx_highbd_sad_skip_32x32_bits8, vpx_highbd_sad32x32_avg_bits8, + vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32, + vpx_highbd_8_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits8, vpx_highbd_sad_skip_32x32x4d_bits8) HIGHBD_BFP( - BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8, - vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, - vpx_highbd_8_sub_pixel_avg_variance8x4, vpx_highbd_sad8x4x4d_bits8) + BLOCK_64X64, vpx_highbd_sad64x64_bits8, + vpx_highbd_sad_skip_64x64_bits8, vpx_highbd_sad64x64_avg_bits8, + vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64, + vpx_highbd_8_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits8, vpx_highbd_sad_skip_64x64x4d_bits8) HIGHBD_BFP( - BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8, - vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, - vpx_highbd_8_sub_pixel_avg_variance4x8, vpx_highbd_sad4x8x4d_bits8) + BLOCK_16X16, vpx_highbd_sad16x16_bits8, + vpx_highbd_sad_skip_16x16_bits8, vpx_highbd_sad16x16_avg_bits8, + vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16, + vpx_highbd_8_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits8, vpx_highbd_sad_skip_16x16x4d_bits8) HIGHBD_BFP( 
- BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8, - vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, - vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits8) + BLOCK_16X8, vpx_highbd_sad16x8_bits8, + vpx_highbd_sad_skip_16x8_bits8, vpx_highbd_sad16x8_avg_bits8, + vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8, + vpx_highbd_8_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits8, vpx_highbd_sad_skip_16x8x4d_bits8) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits8, + vpx_highbd_sad_skip_8x16_bits8, vpx_highbd_sad8x16_avg_bits8, + vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16, + vpx_highbd_8_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits8, vpx_highbd_sad_skip_8x16x4d_bits8) + + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8, + vpx_highbd_sad_skip_8x8_bits8, vpx_highbd_sad8x8_avg_bits8, + vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, + vpx_highbd_8_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits8, vpx_highbd_sad_skip_8x8x4d_bits8) + + HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8, + vpx_highbd_sad_skip_8x4_bits8, vpx_highbd_sad8x4_avg_bits8, + vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, + vpx_highbd_8_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits8, vpx_highbd_sad_skip_8x4x4d_bits8) + + HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8, + vpx_highbd_sad_skip_4x8_bits8, vpx_highbd_sad4x8_avg_bits8, + vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, + vpx_highbd_8_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits8, vpx_highbd_sad_skip_4x8x4d_bits8) + + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8, + vpx_highbd_sad_skip_4x4_bits8, vpx_highbd_sad4x4_avg_bits8, + vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, + vpx_highbd_8_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits8, vpx_highbd_sad_skip_4x4x4d_bits8) break; case VPX_BITS_10: - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10, - vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16, - vpx_highbd_10_sub_pixel_variance32x16, - vpx_highbd_10_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits10) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10, - vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32, - vpx_highbd_10_sub_pixel_variance16x32, - vpx_highbd_10_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits10) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10, - vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32, - vpx_highbd_10_sub_pixel_variance64x32, - vpx_highbd_10_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits10) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10, - vpx_highbd_sad32x64_avg_bits10, vpx_highbd_10_variance32x64, - vpx_highbd_10_sub_pixel_variance32x64, - vpx_highbd_10_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits10) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10, - vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32, - vpx_highbd_10_sub_pixel_variance32x32, - vpx_highbd_10_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits10) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10, - vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64, - vpx_highbd_10_sub_pixel_variance64x64, - vpx_highbd_10_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits10) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10, - vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16, - vpx_highbd_10_sub_pixel_variance16x16, - 
vpx_highbd_10_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits10) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10, - vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8, - vpx_highbd_10_sub_pixel_variance16x8, - vpx_highbd_10_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits10) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10, - vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16, - vpx_highbd_10_sub_pixel_variance8x16, - vpx_highbd_10_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits10) - - HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10, - vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, - vpx_highbd_10_sub_pixel_variance8x8, - vpx_highbd_10_sub_pixel_avg_variance8x8, - vpx_highbd_sad8x8x4d_bits10) - - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10, - vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, - vpx_highbd_10_sub_pixel_variance8x4, - vpx_highbd_10_sub_pixel_avg_variance8x4, - vpx_highbd_sad8x4x4d_bits10) - - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10, - vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, - vpx_highbd_10_sub_pixel_variance4x8, - vpx_highbd_10_sub_pixel_avg_variance4x8, - vpx_highbd_sad4x8x4d_bits10) - - HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10, - vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, - vpx_highbd_10_sub_pixel_variance4x4, - vpx_highbd_10_sub_pixel_avg_variance4x4, - vpx_highbd_sad4x4x4d_bits10) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits10, + vpx_highbd_sad_skip_32x16_bits10, vpx_highbd_sad32x16_avg_bits10, + vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16, + vpx_highbd_10_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits10, vpx_highbd_sad_skip_32x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits10, + vpx_highbd_sad_skip_16x32_bits10, vpx_highbd_sad16x32_avg_bits10, + vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32, + vpx_highbd_10_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits10, vpx_highbd_sad_skip_16x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits10, + vpx_highbd_sad_skip_64x32_bits10, vpx_highbd_sad64x32_avg_bits10, + vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32, + vpx_highbd_10_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits10, vpx_highbd_sad_skip_64x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits10, + vpx_highbd_sad_skip_32x64_bits10, vpx_highbd_sad32x64_avg_bits10, + vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64, + vpx_highbd_10_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits10, vpx_highbd_sad_skip_32x64x4d_bits10) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits10, + vpx_highbd_sad_skip_32x32_bits10, vpx_highbd_sad32x32_avg_bits10, + vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32, + vpx_highbd_10_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits10, vpx_highbd_sad_skip_32x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits10, + vpx_highbd_sad_skip_64x64_bits10, vpx_highbd_sad64x64_avg_bits10, + vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64, + vpx_highbd_10_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits10, vpx_highbd_sad_skip_64x64x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits10, + vpx_highbd_sad_skip_16x16_bits10, vpx_highbd_sad16x16_avg_bits10, + vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16, + vpx_highbd_10_sub_pixel_avg_variance16x16, + 
vpx_highbd_sad16x16x4d_bits10, vpx_highbd_sad_skip_16x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits10, + vpx_highbd_sad_skip_16x8_bits10, vpx_highbd_sad16x8_avg_bits10, + vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8, + vpx_highbd_10_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits10, vpx_highbd_sad_skip_16x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits10, + vpx_highbd_sad_skip_8x16_bits10, vpx_highbd_sad8x16_avg_bits10, + vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16, + vpx_highbd_10_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits10, vpx_highbd_sad_skip_8x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad_skip_8x8_bits10, + vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, + vpx_highbd_10_sub_pixel_variance8x8, + vpx_highbd_10_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits10, vpx_highbd_sad_skip_8x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad_skip_8x4_bits10, + vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, + vpx_highbd_10_sub_pixel_variance8x4, + vpx_highbd_10_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits10, vpx_highbd_sad_skip_8x4x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad_skip_4x8_bits10, + vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, + vpx_highbd_10_sub_pixel_variance4x8, + vpx_highbd_10_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits10, vpx_highbd_sad_skip_4x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad_skip_4x4_bits10, + vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, + vpx_highbd_10_sub_pixel_variance4x4, + vpx_highbd_10_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits10, vpx_highbd_sad_skip_4x4x4d_bits10) break; default: assert(cm->bit_depth == VPX_BITS_12); - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, - vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, - vpx_highbd_12_sub_pixel_variance32x16, - vpx_highbd_12_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits12) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12, - vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32, - vpx_highbd_12_sub_pixel_variance16x32, - vpx_highbd_12_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits12) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12, - vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32, - vpx_highbd_12_sub_pixel_variance64x32, - vpx_highbd_12_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits12) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12, - vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64, - vpx_highbd_12_sub_pixel_variance32x64, - vpx_highbd_12_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits12) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12, - vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32, - vpx_highbd_12_sub_pixel_variance32x32, - vpx_highbd_12_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits12) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12, - vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64, - vpx_highbd_12_sub_pixel_variance64x64, - vpx_highbd_12_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits12) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12, - vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16, - vpx_highbd_12_sub_pixel_variance16x16, - vpx_highbd_12_sub_pixel_avg_variance16x16, - 
vpx_highbd_sad16x16x4d_bits12) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12, - vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8, - vpx_highbd_12_sub_pixel_variance16x8, - vpx_highbd_12_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits12) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12, - vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16, - vpx_highbd_12_sub_pixel_variance8x16, - vpx_highbd_12_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits12) - - HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12, - vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, - vpx_highbd_12_sub_pixel_variance8x8, - vpx_highbd_12_sub_pixel_avg_variance8x8, - vpx_highbd_sad8x8x4d_bits12) - - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12, - vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, - vpx_highbd_12_sub_pixel_variance8x4, - vpx_highbd_12_sub_pixel_avg_variance8x4, - vpx_highbd_sad8x4x4d_bits12) - - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12, - vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, - vpx_highbd_12_sub_pixel_variance4x8, - vpx_highbd_12_sub_pixel_avg_variance4x8, - vpx_highbd_sad4x8x4d_bits12) - - HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12, - vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, - vpx_highbd_12_sub_pixel_variance4x4, - vpx_highbd_12_sub_pixel_avg_variance4x4, - vpx_highbd_sad4x4x4d_bits12) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits12, + vpx_highbd_sad_skip_32x16_bits12, vpx_highbd_sad32x16_avg_bits12, + vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, + vpx_highbd_12_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits12, vpx_highbd_sad_skip_32x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits12, + vpx_highbd_sad_skip_16x32_bits12, vpx_highbd_sad16x32_avg_bits12, + vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32, + vpx_highbd_12_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits12, vpx_highbd_sad_skip_16x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits12, + vpx_highbd_sad_skip_64x32_bits12, vpx_highbd_sad64x32_avg_bits12, + vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32, + vpx_highbd_12_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits12, vpx_highbd_sad_skip_64x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits12, + vpx_highbd_sad_skip_32x64_bits12, vpx_highbd_sad32x64_avg_bits12, + vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64, + vpx_highbd_12_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits12, vpx_highbd_sad_skip_32x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits12, + vpx_highbd_sad_skip_32x32_bits12, vpx_highbd_sad32x32_avg_bits12, + vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32, + vpx_highbd_12_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits12, vpx_highbd_sad_skip_32x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits12, + vpx_highbd_sad_skip_64x64_bits12, vpx_highbd_sad64x64_avg_bits12, + vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64, + vpx_highbd_12_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits12, vpx_highbd_sad_skip_64x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits12, + vpx_highbd_sad_skip_16x16_bits12, vpx_highbd_sad16x16_avg_bits12, + vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16, + vpx_highbd_12_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits12, 
vpx_highbd_sad_skip_16x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits12, + vpx_highbd_sad_skip_16x8_bits12, vpx_highbd_sad16x8_avg_bits12, + vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8, + vpx_highbd_12_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits12, vpx_highbd_sad_skip_16x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits12, + vpx_highbd_sad_skip_8x16_bits12, vpx_highbd_sad8x16_avg_bits12, + vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16, + vpx_highbd_12_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits12, vpx_highbd_sad_skip_8x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad_skip_8x8_bits12, + vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, + vpx_highbd_12_sub_pixel_variance8x8, + vpx_highbd_12_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits12, vpx_highbd_sad_skip_8x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad_skip_8x4_bits12, + vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, + vpx_highbd_12_sub_pixel_variance8x4, + vpx_highbd_12_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits12, vpx_highbd_sad_skip_8x4x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad_skip_4x8_bits12, + vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, + vpx_highbd_12_sub_pixel_variance4x8, + vpx_highbd_12_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits12, vpx_highbd_sad_skip_4x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad_skip_4x4_bits12, + vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, + vpx_highbd_12_sub_pixel_variance4x4, + vpx_highbd_12_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits12, vpx_highbd_sad_skip_4x4x4d_bits12) break; } } @@ -1926,48 +2003,48 @@ static void realloc_segmentation_maps(VP9_COMP *cpi) { // Create the encoder segmentation map and set all entries to 0 vpx_free(cpi->segmentation_map); - CHECK_MEM_ERROR(cm, cpi->segmentation_map, + CHECK_MEM_ERROR(&cm->error, cpi->segmentation_map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); // Create a map used for cyclic background refresh. if (cpi->cyclic_refresh) vp9_cyclic_refresh_free(cpi->cyclic_refresh); - CHECK_MEM_ERROR(cm, cpi->cyclic_refresh, + CHECK_MEM_ERROR(&cm->error, cpi->cyclic_refresh, vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols)); // Create a map used to mark inactive areas. 
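The CHECK_MEM_ERROR churn in this and the surrounding hunks comes from the macro now taking a struct vpx_internal_error_info * (hence &cm->error) instead of a VP9_COMMON *, so code that carries only an error context can use it too. Its behavior is roughly the sketch below; this is a simplified model, not the library's exact definition.

// Simplified model of the updated macro: assign the allocation, and on
// failure report through the codec's internal error state (which jumps
// back to the API boundary) rather than returning an error code.
#define CHECK_MEM_ERROR(error, lval, expr)                    \
  do {                                                        \
    (lval) = (expr);                                          \
    if (!(lval))                                              \
      vpx_internal_error(error, VPX_CODEC_MEM_ERROR,          \
                         "Failed to allocate " #lval);        \
  } while (0)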
vpx_free(cpi->active_map.map); - CHECK_MEM_ERROR(cm, cpi->active_map.map, + CHECK_MEM_ERROR(&cm->error, cpi->active_map.map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); // And a place holder structure is the coding context // for use if we want to save and restore it vpx_free(cpi->coding_context.last_frame_seg_map_copy); - CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy, + CHECK_MEM_ERROR(&cm->error, cpi->coding_context.last_frame_seg_map_copy, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); } static void alloc_copy_partition_data(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; if (cpi->prev_partition == NULL) { - CHECK_MEM_ERROR(cm, cpi->prev_partition, + CHECK_MEM_ERROR(&cm->error, cpi->prev_partition, (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, sizeof(*cpi->prev_partition))); } if (cpi->prev_segment_id == NULL) { CHECK_MEM_ERROR( - cm, cpi->prev_segment_id, + &cm->error, cpi->prev_segment_id, (int8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->prev_segment_id))); } if (cpi->prev_variance_low == NULL) { - CHECK_MEM_ERROR(cm, cpi->prev_variance_low, + CHECK_MEM_ERROR(&cm->error, cpi->prev_variance_low, (uint8_t *)vpx_calloc( (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * 25, sizeof(*cpi->prev_variance_low))); } if (cpi->copied_frame_cnt == NULL) { CHECK_MEM_ERROR( - cm, cpi->copied_frame_cnt, + &cm->error, cpi->copied_frame_cnt, (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->copied_frame_cnt))); } @@ -2085,13 +2162,13 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) { vpx_free(cpi->consec_zero_mv); CHECK_MEM_ERROR( - cm, cpi->consec_zero_mv, + &cm->error, cpi->consec_zero_mv, vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); vpx_free(cpi->skin_map); CHECK_MEM_ERROR( - cm, cpi->skin_map, - vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); + &cm->error, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map))); free_copy_partition_data(cpi); alloc_copy_partition_data(cpi); @@ -2132,18 +2209,13 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { vp9_set_row_mt(cpi); } -#ifndef M_LOG2_E -#define M_LOG2_E 0.693147180559945309417 -#endif -#define log2f(x) (log(x) / (float)M_LOG2_E) - /*********************************************************************** * Read before modifying 'cal_nmvjointsadcost' or 'cal_nmvsadcosts' * *********************************************************************** * The following 2 functions ('cal_nmvjointsadcost' and * * 'cal_nmvsadcosts') are used to calculate cost lookup tables * * used by 'vp9_diamond_search_sad'. 
The C implementation of the * - * function is generic, but the AVX intrinsics optimised version * + * function is generic, but the NEON intrinsics optimised version * * relies on the following properties of the computed tables: * * For cal_nmvjointsadcost: * * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * @@ -2152,7 +2224,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { * (Equal costs for both components) * * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * * (Cost function is even) * - * If these do not hold, then the AVX optimised version of the * + * If these do not hold, then the NEON optimised version of the * * 'vp9_diamond_search_sad' function cannot be used as it is, in which * * case you can revert to using the C function instead. * ***********************************************************************/ @@ -2310,7 +2382,7 @@ void vp9_update_compressor_with_img_fmt(VP9_COMP *cpi, vpx_img_fmt_t img_fmt) { VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, BufferPool *const pool) { unsigned int i; - VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP)); + VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(*cpi)); VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL; if (!cm) return NULL; @@ -2328,9 +2400,10 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, cm->free_mi = vp9_enc_free_mi; cm->setup_mi = vp9_enc_setup_mi; - CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); + CHECK_MEM_ERROR(&cm->error, cm->fc, + (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); CHECK_MEM_ERROR( - cm, cm->frame_contexts, + &cm->error, cm->frame_contexts, (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); cpi->compute_frame_low_motion_onepass = 1; @@ -2357,38 +2430,38 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, realloc_segmentation_maps(cpi); CHECK_MEM_ERROR( - cm, cpi->skin_map, - vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); + &cm->error, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map))); #if !CONFIG_REALTIME_ONLY - CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); + CHECK_MEM_ERROR(&cm->error, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); #endif CHECK_MEM_ERROR( - cm, cpi->consec_zero_mv, + &cm->error, cpi->consec_zero_mv, vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1]))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1], + CHECK_MEM_ERROR(&cm->error, 
cpi->nmvsadcosts_hp[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1]))); for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0])); i++) { CHECK_MEM_ERROR( - cm, cpi->mbgraph_stats[i].mb_stats, + &cm->error, cpi->mbgraph_stats[i].mb_stats, vpx_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); } @@ -2432,7 +2505,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, } if (cpi->b_calculate_consistency) { - CHECK_MEM_ERROR(cm, cpi->ssim_vars, + CHECK_MEM_ERROR(&cm->error, cpi->ssim_vars, vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, sizeof(*cpi->ssim_vars) * 4)); cpi->worst_consistency = 100.0; @@ -2503,11 +2576,11 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, cpi->svc.number_temporal_layers > 1) { FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf; FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = { 0 }; - int i; + int n; - for (i = 0; i < oxcf->ss_number_layers; ++i) { + for (n = 0; n < oxcf->ss_number_layers; ++n) { FIRSTPASS_STATS *const last_packet_for_layer = - &stats[packets - oxcf->ss_number_layers + i]; + &stats[packets - oxcf->ss_number_layers + n]; const int layer_id = (int)last_packet_for_layer->spatial_layer_id; const int packets_in_layer = (int)last_packet_for_layer->count + 1; if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) { @@ -2517,7 +2590,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, vpx_free(lc->rc_twopass_stats_in.buf); lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz; - CHECK_MEM_ERROR(cm, lc->rc_twopass_stats_in.buf, + CHECK_MEM_ERROR(&cm->error, lc->rc_twopass_stats_in.buf, vpx_malloc(lc->rc_twopass_stats_in.sz)); lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf; lc->twopass.stats_in = lc->twopass.stats_in_start; @@ -2532,11 +2605,11 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, } } - for (i = 0; i < packets; ++i) { - const int layer_id = (int)stats[i].spatial_layer_id; + for (n = 0; n < packets; ++n) { + const int layer_id = (int)stats[n].spatial_layer_id; if (layer_id >= 0 && layer_id < oxcf->ss_number_layers && stats_copy[layer_id] != NULL) { - *stats_copy[layer_id] = stats[i]; + *stats_copy[layer_id] = stats[n]; ++stats_copy[layer_id]; } } @@ -2572,7 +2645,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, const int h = num_8x8_blocks_high_lookup[bsize]; const int num_cols = (cm->mi_cols + w - 1) / w; const int num_rows = (cm->mi_rows + h - 1) / h; - CHECK_MEM_ERROR(cm, cpi->mi_ssim_rdmult_scaling_factors, + CHECK_MEM_ERROR(&cm->error, cpi->mi_ssim_rdmult_scaling_factors, vpx_calloc(num_rows * num_cols, sizeof(*cpi->mi_ssim_rdmult_scaling_factors))); } @@ -2581,67 +2654,76 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, #if CONFIG_NON_GREEDY_MV cpi->tpl_ready = 0; #endif // CONFIG_NON_GREEDY_MV - for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) cpi->tpl_stats[i].tpl_stats_ptr = NULL; + for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) { + cpi->tpl_stats[i].tpl_stats_ptr = NULL; + } // Allocate memory to store variances for a frame. 
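Several allocations above also move from sizeof(type) or sizeof(ptr[0]) to the sizeof(*ptr) idiom, which stays correct even if the pointee type is later changed. The pitfall the idiom guards against is writing sizeof(ptr), which silently sizes elements as pointers; a toy example (this local diff struct is a stand-in for the encoder's real type):

#include <stdlib.h>

typedef struct { unsigned int sse, sum, var; } diff;  // 12-byte element

int demo(size_t n) {
  diff *v = calloc(n, sizeof(*v));  // element size: n * 12 bytes
  /* The buggy form, calloc(n, sizeof(v)), would request n * 8 bytes on a
     64-bit target and overflow once writes pass two thirds of the array. */
  if (v == NULL) return -1;
  free(v);
  return 0;
}

The source_diff_var allocation that follows is sized with the same idiom.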
- CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff))); + CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var, + vpx_calloc(cm->MBs, sizeof(*cpi->source_diff_var))); cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; - - BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16, - vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, - vpx_sad32x16x4d) - - BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32, - vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, - vpx_sad16x32x4d) - - BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32, - vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, - vpx_sad64x32x4d) - - BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64, - vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, - vpx_sad32x64x4d) - - BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32, - vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32, - vpx_sad32x32x4d) - - BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64, - vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64, - vpx_sad64x64x4d) - - BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16, - vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16, - vpx_sad16x16x4d) - - BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8, - vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, - vpx_sad16x8x4d) - - BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16, - vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, - vpx_sad8x16x4d) - - BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8, - vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d) - - BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4, - vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d) - - BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8, - vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d) - - BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4, - vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d) +#define BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; + + BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad_skip_32x16, vpx_sad32x16_avg, + vpx_variance32x16, vpx_sub_pixel_variance32x16, + vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d, vpx_sad_skip_32x16x4d) + + BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad_skip_16x32, vpx_sad16x32_avg, + vpx_variance16x32, vpx_sub_pixel_variance16x32, + vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d, vpx_sad_skip_16x32x4d) + + BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad_skip_64x32, vpx_sad64x32_avg, + vpx_variance64x32, vpx_sub_pixel_variance64x32, + vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d, vpx_sad_skip_64x32x4d) + + BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad_skip_32x64, vpx_sad32x64_avg, + vpx_variance32x64, vpx_sub_pixel_variance32x64, + vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d,
vpx_sad_skip_32x64x4d) + + BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad_skip_32x32, vpx_sad32x32_avg, + vpx_variance32x32, vpx_sub_pixel_variance32x32, + vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x4d, vpx_sad_skip_32x32x4d) + + BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad_skip_64x64, vpx_sad64x64_avg, + vpx_variance64x64, vpx_sub_pixel_variance64x64, + vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x4d, vpx_sad_skip_64x64x4d) + + BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad_skip_16x16, vpx_sad16x16_avg, + vpx_variance16x16, vpx_sub_pixel_variance16x16, + vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x4d, vpx_sad_skip_16x16x4d) + + BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad_skip_16x8, vpx_sad16x8_avg, + vpx_variance16x8, vpx_sub_pixel_variance16x8, + vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x4d, vpx_sad_skip_16x8x4d) + + BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad_skip_8x16, vpx_sad8x16_avg, + vpx_variance8x16, vpx_sub_pixel_variance8x16, + vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x4d, vpx_sad_skip_8x16x4d) + + BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad_skip_8x8, vpx_sad8x8_avg, vpx_variance8x8, + vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d, + vpx_sad_skip_8x8x4d) + + BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad_skip_8x4, vpx_sad8x4_avg, vpx_variance8x4, + vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d, + vpx_sad_skip_8x4x4d) + + BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad_skip_4x8, vpx_sad4x8_avg, vpx_variance4x8, + vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d, + vpx_sad_skip_4x8x4d) + + BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad_skip_4x4, vpx_sad4x4_avg, vpx_variance4x4, + vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d, + vpx_sad_skip_4x4x4d) #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); @@ -2689,8 +2771,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V)) #endif // CONFIG_INTERNAL_STATS -static void free_tpl_buffer(VP9_COMP *cpi); - void vp9_remove_compressor(VP9_COMP *cpi) { VP9_COMMON *cm; unsigned int i; @@ -2784,7 +2864,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { #if 0 { printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); - printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); + printf("\n_frames receive_data encod_mb_row compress_frame Total\n"); printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000, cpi->time_compress_data / 1000, @@ -2804,7 +2884,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vpx_free(cpi->kmeans_data_arr); } - free_tpl_buffer(cpi); + vp9_free_tpl_buffer(cpi); vp9_loop_filter_dealloc(&cpi->lf_row_sync); vp9_bitstream_encode_tiles_buffer_dealloc(cpi); @@ -2824,6 +2904,10 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_extrc_delete(&cpi->ext_ratectrl); + // Help detect use after free of the error detail string. 
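The teardown addition just below is a small hardening trick: cm->error.detail is overwritten with a recognizable 'AAAA...' pattern (kept NUL-terminated) right before the common data is destroyed, so any caller that holds on to the detail string past vp9_remove_compressor() sees obvious garbage instead of a plausible message. The poisoning idea generalizes; a hedged helper sketch, not part of the diff:

#include <string.h>

// Poison a soon-to-be-invalid string buffer so stale readers fail
// loudly; keep a terminating NUL in case the string is still printed.
static void poison_cstr(char *buf, size_t size) {
  if (size == 0) return;
  memset(buf, 'A', size - 1);
  buf[size - 1] = '\0';
}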
+ memset(cm->error.detail, 'A', sizeof(cm->error.detail) - 1); + cm->error.detail[sizeof(cm->error.detail) - 1] = '\0'; + vp9_remove_common(cm); vp9_free_ref_frame_buffers(cm->buffer_pool); #if CONFIG_VP9_POSTPROC @@ -2893,7 +2977,7 @@ void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags) { static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer( VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag) { - MV_REFERENCE_FRAME ref_frame = NONE; + MV_REFERENCE_FRAME ref_frame = NO_REF_FRAME; if (ref_frame_flag == VP9_LAST_FLAG) ref_frame = LAST_FRAME; else if (ref_frame_flag == VP9_GOLD_FLAG) @@ -2901,7 +2985,8 @@ static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer( else if (ref_frame_flag == VP9_ALT_FLAG) ref_frame = ALTREF_FRAME; - return ref_frame == NONE ? NULL : get_ref_frame_buffer(cpi, ref_frame); + return ref_frame == NO_REF_FRAME ? NULL + : get_ref_frame_buffer(cpi, ref_frame); } int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, @@ -2994,12 +3079,11 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { #endif #if CONFIG_VP9_HIGHBITDEPTH -static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, - int bd) { +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd) { #else -static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { #endif // CONFIG_VP9_HIGHBITDEPTH // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t int i; @@ -3044,6 +3128,23 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; + + // The issue b/311394513 reveals a corner case bug. + // For bd = 8, vpx_scaled_2d() requires both x_step_q4 and y_step_q4 are less + // than or equal to 64. For bd >= 10, vpx_highbd_convolve8() requires both + // x_step_q4 and y_step_q4 are less than or equal to 32. If this condition + // isn't met, it needs to call vp9_scale_and_extend_frame_nonnormative() that + // supports arbitrary scaling. 
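The guard being introduced here turns the scaler limits into arithmetic: step_q4 is a 4.4 fixed-point ratio in which 16 means 1:1, so the 8-bit limit of 64 corresponds to a 4x downscale and the high-bit-depth limit of 32 to 2x. For example, scaling a 1920-wide source to a 320-wide target gives x_step_q4 = 16 * 1920 / 320 = 96 > 64, which forces the nonnormative fallback even at 8-bit. The predicate, lifted into a self-contained helper for clarity (same logic and names as the code that follows; the helper itself is illustrative):

// Nonzero when the normative fixed-point scalers cannot represent the
// requested ratio and the arbitrary-ratio fallback must be used.
static int needs_nonnormative_scaler(int src_w, int src_h, int dst_w,
                                     int dst_h, int bd) {
  const int x_step_q4 = 16 * src_w / dst_w;  // 4.4 fixed point; 16 == 1:1
  const int y_step_q4 = 16 * src_h / dst_h;
  return (bd == 8 && (x_step_q4 > 64 || y_step_q4 > 64)) ||
         (bd >= 10 && (x_step_q4 > 32 || y_step_q4 > 32));
}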
+ const int x_step_q4 = 16 * src_w / dst_w; + const int y_step_q4 = 16 * src_h / dst_h; + const int is_arbitrary_scaling = + (bd == 8 && (x_step_q4 > 64 || y_step_q4 > 64)) || + (bd >= 10 && (x_step_q4 > 32 || y_step_q4 > 32)); + if (is_arbitrary_scaling) { + vp9_scale_and_extend_frame_nonnormative(src, dst, bd); + return; + } + const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, src->v_buffer }; const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; @@ -3352,19 +3453,6 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { vpx_extend_frame_inner_borders(cm->frame_to_show); } -static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { - RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; - if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || - new_fb_ptr->mi_cols < cm->mi_cols) { - vpx_free(new_fb_ptr->mvs); - CHECK_MEM_ERROR(cm, new_fb_ptr->mvs, - (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, - sizeof(*new_fb_ptr->mvs))); - new_fb_ptr->mi_rows = cm->mi_rows; - new_fb_ptr->mi_cols = cm->mi_cols; - } -} - void vp9_scale_references(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; MV_REFERENCE_FRAME ref_frame; @@ -3711,7 +3799,7 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, case 6: l = 150; break; } if (!cpi->common.postproc_state.limits) { - CHECK_MEM_ERROR(cm, cpi->common.postproc_state.limits, + CHECK_MEM_ERROR(&cm->error, cpi->common.postproc_state.limits, vpx_calloc(cpi->un_scaled_source->y_width, sizeof(*cpi->common.postproc_state.limits))); } @@ -3800,6 +3888,7 @@ static void set_frame_size(VP9_COMP *cpi) { alloc_util_frame_buffers(cpi); init_motion_estimation(cpi); + int has_valid_ref_frame = 0; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1]; const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); @@ -3818,22 +3907,25 @@ static void set_frame_size(VP9_COMP *cpi) { buf->y_crop_height, cm->width, cm->height); #endif // CONFIG_VP9_HIGHBITDEPTH + has_valid_ref_frame |= vp9_is_valid_scale(&ref_buf->sf); if (vp9_is_scaled(&ref_buf->sf)) vpx_extend_frame_borders(buf); } else { ref_buf->buf = NULL; } } + if (!frame_is_intra_only(cm) && !has_valid_ref_frame) { + vpx_internal_error( + &cm->error, VPX_CODEC_CORRUPT_FRAME, + "Can't find at least one reference frame with valid size"); + } set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL static void save_encode_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - int tile_col, tile_row; + int tile_idx; int i, j; + TileDataEnc *tile_data; RD_OPT *rd_opt = &cpi->rd; for (i = 0; i < MAX_REF_FRAMES; i++) { for (j = 0; j < REFERENCE_MODES; j++) @@ -3844,21 +3936,12 @@ static void save_encode_params(VP9_COMP *cpi) { rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j]; } - if (cpi->tile_data != NULL) { - for (tile_row = 0; tile_row < tile_rows; ++tile_row) - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileDataEnc *tile_data = - &cpi->tile_data[tile_row * tile_cols + tile_col]; - for (i = 0; i < BLOCK_SIZES; ++i) { - for (j = 0; j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact_prev[i][j] = - tile_data->thresh_freq_fact[i][j]; - } - } - } + for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) { + assert(cpi->tile_data); + tile_data = 
&cpi->tile_data[tile_idx]; + vp9_copy(tile_data->thresh_freq_fact_prev, tile_data->thresh_freq_fact); } } -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL static INLINE void set_raw_source_frame(VP9_COMP *cpi) { #ifdef ENABLE_KF_DENOISE @@ -4005,6 +4088,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cpi->rc.hybrid_intra_scene_change = 0; cpi->rc.re_encode_maxq_scene_change = 0; if (cm->show_frame && cpi->oxcf.mode == REALTIME && + !cpi->disable_scene_detection_rtc_ratectrl && (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.content == VP9E_CONTENT_SCREEN || (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8))) @@ -4067,7 +4151,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, svc->spatial_layer_id == svc->number_spatial_layers - 2) { if (svc->prev_partition_svc == NULL) { CHECK_MEM_ERROR( - cm, svc->prev_partition_svc, + &cm->error, svc->prev_partition_svc, (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, sizeof(*svc->prev_partition_svc))); } @@ -4419,10 +4503,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth; #if CONFIG_RATE_CTRL - const FRAME_UPDATE_TYPE update_type = - cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; - const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type); - RATE_QSTEP_MODEL *rq_model = &cpi->rq_model[frame_type]; + RATE_QSTEP_MODEL *rq_model; + { + const FRAME_UPDATE_TYPE update_type = + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; + const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type); + rq_model = &cpi->rq_model[frame_type]; + } init_rq_history(rq_history); #endif // CONFIG_RATE_CTRL @@ -4438,6 +4525,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest (cpi->twopass.gf_group.index == 1) : 0; +#if CONFIG_COLLECT_COMPONENT_TIMING + printf("\n Encoding a frame: \n"); +#endif do { vpx_clear_system_state(); @@ -4525,7 +4615,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } #endif // CONFIG_RATE_CTRL if (cpi->ext_ratectrl.ready && !ext_rc_recode && - (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 && + cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; vpx_rc_encodeframe_decision_t encode_frame_decision; @@ -4825,6 +4916,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) if (loop) restore_coding_context(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + if (loop) printf("\n Recoding:"); +#endif } while (loop); rc->max_frame_bandwidth = orig_rc_max_frame_bandwidth; @@ -4922,13 +5016,14 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required( scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth, filter_type, phase_scaler); else - scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth); + vp9_scale_and_extend_frame_nonnormative(unscaled, scaled, + (int)cm->bit_depth); #else if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && unscaled->y_height <= (scaled->y_height << 1)) vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler); else - scale_and_extend_frame_nonnormative(unscaled, scaled); + vp9_scale_and_extend_frame_nonnormative(unscaled, scaled); #endif // CONFIG_VP9_HIGHBITDEPTH return scaled; } else { @@ -4980,8 +5075,8 
@@ static int setup_interp_filter_search_mask(VP9_COMP *cpi) { #ifdef ENABLE_KF_DENOISE // Baseline kernel weights for denoise -static uint8_t dn_kernal_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; -static uint8_t dn_kernal_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4, +static uint8_t dn_kernel_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; +static uint8_t dn_kernel_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1 }; static INLINE void add_denoise_point(int centre_val, int data_val, int thresh, @@ -4998,17 +5093,17 @@ static void spatial_denoise_point(uint8_t *src_ptr, const int stride, int sum_weight = 0; int sum_val = 0; int thresh = strength; - int kernal_size = 5; + int kernel_size = 5; int half_k_size = 2; int i, j; int max_diff = 0; uint8_t *tmp_ptr; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; // Find the maximum deviation from the source point in the locale. tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1); - for (i = 0; i < kernal_size + 2; ++i) { - for (j = 0; j < kernal_size + 2; ++j) { + for (i = 0; i < kernel_size + 2; ++i) { + for (j = 0; j < kernel_size + 2; ++j) { max_diff = VPXMAX(max_diff, abs((int)*src_ptr - (int)tmp_ptr[j])); } tmp_ptr += stride; @@ -5016,19 +5111,19 @@ static void spatial_denoise_point(uint8_t *src_ptr, const int stride, // Select the kernel size. if (max_diff > (strength + (strength >> 1))) { - kernal_size = 3; + kernel_size = 3; half_k_size = 1; thresh = thresh >> 1; } - kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5; + kernel_ptr = (kernel_size == 3) ? dn_kernel_3 : dn_kernel_5; // Apply the kernel tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size; - for (i = 0; i < kernal_size; ++i) { - for (j = 0; j < kernal_size; ++j) { - add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr, + for (i = 0; i < kernel_size; ++i) { + for (j = 0; j < kernel_size; ++j) { + add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr, &sum_val, &sum_weight); - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } @@ -5043,17 +5138,17 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride, int sum_weight = 0; int sum_val = 0; int thresh = strength; - int kernal_size = 5; + int kernel_size = 5; int half_k_size = 2; int i, j; int max_diff = 0; uint16_t *tmp_ptr; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; // Find the maximum deviation from the source point in the locale. tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1); - for (i = 0; i < kernal_size + 2; ++i) { - for (j = 0; j < kernal_size + 2; ++j) { + for (i = 0; i < kernel_size + 2; ++i) { + for (j = 0; j < kernel_size + 2; ++j) { max_diff = VPXMAX(max_diff, abs((int)src_ptr - (int)tmp_ptr[j])); } tmp_ptr += stride; @@ -5061,19 +5156,19 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride, // Select the kernel size. if (max_diff > (strength + (strength >> 1))) { - kernal_size = 3; + kernel_size = 3; half_k_size = 1; thresh = thresh >> 1; } - kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5; + kernel_ptr = (kernel_size == 3) ? 
dn_kernel_3 : dn_kernel_5; // Apply the kernel tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size; - for (i = 0; i < kernal_size; ++i) { - for (j = 0; j < kernal_size; ++j) { - add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr, + for (i = 0; i < kernel_size; ++i) { + for (j = 0; j < kernel_size; ++j) { + add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr, &sum_val, &sum_weight); - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } @@ -5260,7 +5355,7 @@ static void init_mb_wiener_var_buffer(VP9_COMP *cpi) { cpi->mb_wiener_variance = NULL; CHECK_MEM_ERROR( - cm, cpi->mb_wiener_variance, + &cm->error, cpi->mb_wiener_variance, vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->mb_wiener_variance))); cpi->mb_wiener_var_rows = cm->mb_rows; cpi->mb_wiener_var_cols = cm->mb_cols; @@ -5319,16 +5414,16 @@ static void set_mb_wiener_variance(VP9_COMP *cpi) { vpx_highbd_subtract_block(block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, zero_pred, block_size, xd->bd); - highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + vp9_highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); } else { vpx_subtract_block(block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, zero_pred, block_size); - wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); } #else vpx_subtract_block(block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, zero_pred, block_size); - wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); #endif // CONFIG_VP9_HIGHBITDEPTH coeff[0] = 0; @@ -5447,26 +5542,7 @@ static void encode_frame_to_data_rate( struct segmentation *const seg = &cm->seg; TX_SIZE t; - // SVC: skip encoding of enhancement layer if the layer target bandwidth = 0. - // No need to set svc.skip_enhancement_layer if whole superframe will be - // dropped. - if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && - cpi->oxcf.target_bandwidth == 0 && - !(cpi->svc.framedrop_mode != LAYER_DROP && - (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP || - cpi->svc - .force_drop_constrained_from_above[cpi->svc.number_spatial_layers - - 1]) && - cpi->svc.drop_spatial_layer[0])) { - cpi->svc.skip_enhancement_layer = 1; - vp9_rc_postencode_update_drop_frame(cpi); - cpi->ext_refresh_frame_flags_pending = 0; - cpi->last_frame_dropped = 1; - cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; - vp9_inc_frame_in_layer(cpi); - return; - } + if (vp9_svc_check_skip_enhancement_layer(cpi)) return; set_ext_overrides(cpi); vpx_clear_system_state(); @@ -5484,6 +5560,11 @@ static void encode_frame_to_data_rate( set_ref_sign_bias(cpi); } + // On the very first frame set the deadline_mode_previous_frame to + // the current mode. + if (cpi->common.current_video_frame == 0) + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; + // Set default state for segment based loop filter update flags. 
cm->lf.mode_ref_delta_update = 0; @@ -5531,16 +5612,11 @@ static void encode_frame_to_data_rate( memset(cpi->mode_chosen_counts, 0, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif -#if CONFIG_CONSISTENT_RECODE // Backup to ensure consistency between recodes save_encode_params(cpi); -#elif CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - save_encode_params(cpi); - } -#endif if (cpi->ext_ratectrl.ready && - (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.funcs.get_frame_rdmult != NULL) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; @@ -5572,8 +5648,14 @@ static void encode_frame_to_data_rate( #if !CONFIG_REALTIME_ONLY #if CONFIG_RATE_CTRL encode_with_recode_loop(cpi, size, dest, &encode_frame_result->rq_history); -#else // CONFIG_RATE_CTRL +#else // CONFIG_RATE_CTRL +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_with_recode_loop_time); +#endif encode_with_recode_loop(cpi, size, dest); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_with_recode_loop_time); +#endif #endif // CONFIG_RATE_CTRL #endif // !CONFIG_REALTIME_ONLY } @@ -5632,15 +5714,28 @@ static void encode_frame_to_data_rate( cm->frame_to_show->render_width = cm->render_width; cm->frame_to_show->render_height = cm->render_height; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, loopfilter_frame_time); +#endif // Pick the loop filter level for the frame. loopfilter_frame(cpi, cm); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, loopfilter_frame_time); +#endif if (cpi->rc.use_post_encode_drop) save_coding_context(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_pack_bitstream_time); +#endif // build the bitstream vp9_pack_bitstream(cpi, dest, size); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_pack_bitstream_time); +#endif - if (cpi->ext_ratectrl.ready) { + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.update_encodeframe_result != NULL) { const RefCntBuffer *coded_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result( @@ -6228,1391 +6323,6 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { } } -typedef struct GF_PICTURE { - YV12_BUFFER_CONFIG *frame; - int ref_frame[3]; - FRAME_UPDATE_TYPE update_type; -} GF_PICTURE; - -static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, - const GF_GROUP *gf_group, int *tpl_group_frames) { - VP9_COMMON *cm = &cpi->common; - int frame_idx = 0; - int i; - int gld_index = -1; - int alt_index = -1; - int lst_index = -1; - int arf_index_stack[MAX_ARF_LAYERS]; - int arf_stack_size = 0; - int extend_frame_count = 0; - int pframe_qindex = cpi->tpl_stats[2].base_qindex; - int frame_gop_offset = 0; - - RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; - int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS]; - - memset(recon_frame_index, -1, sizeof(recon_frame_index)); - stack_init(arf_index_stack, MAX_ARF_LAYERS); - - // TODO(jingning): To be used later for gf frame type parsing. 
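A pattern running through the encode_frame_to_data_rate() changes above: the external rate-control hooks (get_encodeframe_decision earlier, get_frame_rdmult and update_encodeframe_result here) are all optional, and each call site now tests the function pointer for NULL in addition to the rc_type capability bits, so an integration may populate only the hooks it implements. The shape of the pattern, shown with toy types rather than the actual vpx_rc_funcs_t:

#include <stddef.h>

// Toy version of an optional-hook table: unset members stay NULL and
// callers fall back to the built-in behavior.
typedef struct {
  int (*get_frame_rdmult)(void *model, int *rdmult);  // optional hook
  void *model;
} toy_rc_funcs;

static int frame_rdmult(const toy_rc_funcs *rc, int default_rdmult) {
  int rdmult = default_rdmult;
  if (rc->get_frame_rdmult != NULL) rc->get_frame_rdmult(rc->model, &rdmult);
  return rdmult;
}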
- (void)gf_group; - - for (i = 0; i < FRAME_BUFFERS; ++i) { - if (frame_bufs[i].ref_count == 0) { - alloc_frame_mvs(cm, i); - if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate frame buffer"); - - recon_frame_index[frame_idx] = i; - ++frame_idx; - - if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break; - } - } - - for (i = 0; i < REFS_PER_FRAME + 1; ++i) { - assert(recon_frame_index[i] >= 0); - cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; - } - - *tpl_group_frames = 0; - - // Initialize Golden reference frame. - gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME); - for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1; - gf_picture[0].update_type = gf_group->update_type[0]; - gld_index = 0; - ++*tpl_group_frames; - - // Initialize base layer ARF frame - gf_picture[1].frame = cpi->Source; - gf_picture[1].ref_frame[0] = gld_index; - gf_picture[1].ref_frame[1] = lst_index; - gf_picture[1].ref_frame[2] = alt_index; - gf_picture[1].update_type = gf_group->update_type[1]; - alt_index = 1; - ++*tpl_group_frames; - - // Initialize P frames - for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { - struct lookahead_entry *buf; - frame_gop_offset = gf_group->frame_gop_index[frame_idx]; - buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); - - if (buf == NULL) break; - - gf_picture[frame_idx].frame = &buf->img; - gf_picture[frame_idx].ref_frame[0] = gld_index; - gf_picture[frame_idx].ref_frame[1] = lst_index; - gf_picture[frame_idx].ref_frame[2] = alt_index; - gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx]; - - switch (gf_group->update_type[frame_idx]) { - case ARF_UPDATE: - stack_push(arf_index_stack, alt_index, arf_stack_size); - ++arf_stack_size; - alt_index = frame_idx; - break; - case LF_UPDATE: lst_index = frame_idx; break; - case OVERLAY_UPDATE: - gld_index = frame_idx; - alt_index = stack_pop(arf_index_stack, arf_stack_size); - --arf_stack_size; - break; - case USE_BUF_FRAME: - lst_index = alt_index; - alt_index = stack_pop(arf_index_stack, arf_stack_size); - --arf_stack_size; - break; - default: break; - } - - ++*tpl_group_frames; - - // The length of group of pictures is baseline_gf_interval, plus the - // beginning golden frame from last GOP, plus the last overlay frame in - // the same GOP. - if (frame_idx == gf_group->gf_group_size) break; - } - - alt_index = -1; - ++frame_idx; - ++frame_gop_offset; - - // Extend two frames outside the current gf group. 
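// The deleted code above tracks nested ARF references with a small LIFO of
// frame indices (stack_init/stack_push/stack_pop). A self-contained sketch of
// such helpers, assuming the top of the stack lives at index 0 and -1 marks
// an empty slot (the real static definitions may differ in detail):
static INLINE void stack_init(int *stack, int length) {
  int idx;
  for (idx = 0; idx < length; ++idx) stack[idx] = -1;
}
static INLINE void stack_push(int *stack, int new_item, int size) {
  int idx;
  for (idx = size; idx > 0; --idx) stack[idx] = stack[idx - 1];  // shift up
  stack[0] = new_item;  // new top of stack
}
static INLINE int stack_pop(int *stack, int size) {
  int idx;
  const int item = stack[0];
  for (idx = 0; idx < size; ++idx) stack[idx] = stack[idx + 1];  // shift down
  return item;
}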
- for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { - struct lookahead_entry *buf = - vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); - - if (buf == NULL) break; - - cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; - - gf_picture[frame_idx].frame = &buf->img; - gf_picture[frame_idx].ref_frame[0] = gld_index; - gf_picture[frame_idx].ref_frame[1] = lst_index; - gf_picture[frame_idx].ref_frame[2] = alt_index; - gf_picture[frame_idx].update_type = LF_UPDATE; - lst_index = frame_idx; - ++*tpl_group_frames; - ++extend_frame_count; - ++frame_gop_offset; - } -} - -static void init_tpl_stats(VP9_COMP *cpi) { - int frame_idx; - for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - memset(tpl_frame->tpl_stats_ptr, 0, - tpl_frame->height * tpl_frame->width * - sizeof(*tpl_frame->tpl_stats_ptr)); - tpl_frame->is_valid = 0; - } -} - -#if CONFIG_NON_GREEDY_MV -static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, - MotionField *motion_field, - int frame_idx, uint8_t *cur_frame_buf, - uint8_t *ref_frame_buf, int stride, - BLOCK_SIZE bsize, int mi_row, - int mi_col, MV *mv) { - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - int step_param; - uint32_t bestsme = UINT_MAX; - const MvLimits tmp_mv_limits = x->mv_limits; - // lambda is used to adjust the importance of motion vector consistency. - // TODO(angiebird): Figure out lambda's proper value. - const int lambda = cpi->tpl_stats[frame_idx].lambda; - int_mv nb_full_mvs[NB_MVS_NUM]; - int nb_full_mv_num; - - MV best_ref_mv1 = { 0, 0 }; - MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - - best_ref_mv1_full.col = best_ref_mv1.col >> 3; - best_ref_mv1_full.row = best_ref_mv1.row >> 3; - - // Setup frame pointers - x->plane[0].src.buf = cur_frame_buf; - x->plane[0].src.stride = stride; - xd->plane[0].pre[0].buf = ref_frame_buf; - xd->plane[0].pre[0].stride = stride; - - step_param = mv_sf->reduce_first_step_size; - step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); - - vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - - nb_full_mv_num = - vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs); - vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param, - lambda, 1, nb_full_mvs, nb_full_mv_num, mv); - - /* restore UMV window */ - x->mv_limits = tmp_mv_limits; - - return bestsme; -} - -static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, - uint8_t *cur_frame_buf, - uint8_t *ref_frame_buf, int stride, - BLOCK_SIZE bsize, MV *mv) { - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - uint32_t bestsme = UINT_MAX; - uint32_t distortion; - uint32_t sse; - int cost_list[5]; - - MV best_ref_mv1 = { 0, 0 }; - - // Setup frame pointers - x->plane[0].src.buf = cur_frame_buf; - x->plane[0].src.stride = stride; - xd->plane[0].pre[0].buf = ref_frame_buf; - xd->plane[0].pre[0].stride = stride; - - // TODO(yunqing): may use higher tap interp filter than 2 taps. 
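// Motion vectors above are stored in eighth-pel units, so the full-pel seed
// (best_ref_mv1_full) is derived with an arithmetic right shift by 3, which
// floors rather than truncates. For example:
//   MV mv = { -20, 13 };  /* -2.5 px vertically, +1.625 px horizontally */
//   MV full = { mv.row >> 3, mv.col >> 3 };  /* gives { -3, 1 } */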
- // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step( - x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, - cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, - USE_2_TAPS); - - return bestsme; -} - -#else // CONFIG_NON_GREEDY_MV -static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td, - uint8_t *cur_frame_buf, - uint8_t *ref_frame_buf, - int stride, BLOCK_SIZE bsize, - MV *mv) { - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - const SEARCH_METHODS search_method = NSTEP; - int step_param; - int sadpb = x->sadperbit16; - uint32_t bestsme = UINT_MAX; - uint32_t distortion; - uint32_t sse; - int cost_list[5]; - const MvLimits tmp_mv_limits = x->mv_limits; - - MV best_ref_mv1 = { 0, 0 }; - MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - - best_ref_mv1_full.col = best_ref_mv1.col >> 3; - best_ref_mv1_full.row = best_ref_mv1.row >> 3; - - // Setup frame pointers - x->plane[0].src.buf = cur_frame_buf; - x->plane[0].src.stride = stride; - xd->plane[0].pre[0].buf = ref_frame_buf; - xd->plane[0].pre[0].stride = stride; - - step_param = mv_sf->reduce_first_step_size; - step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); - - vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - - vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, - search_method, sadpb, cond_cost_list(cpi, cost_list), - &best_ref_mv1, mv, 0, 0); - - /* restore UMV window */ - x->mv_limits = tmp_mv_limits; - - // TODO(yunqing): may use higher tap interp filter than 2 taps. - // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step( - x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, - cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, - USE_2_TAPS); - - return bestsme; -} -#endif - -static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, - int ref_pos_col, int block, BLOCK_SIZE bsize) { - int width = 0, height = 0; - int bw = 4 << b_width_log2_lookup[bsize]; - int bh = 4 << b_height_log2_lookup[bsize]; - - switch (block) { - case 0: - width = grid_pos_col + bw - ref_pos_col; - height = grid_pos_row + bh - ref_pos_row; - break; - case 1: - width = ref_pos_col + bw - grid_pos_col; - height = grid_pos_row + bh - ref_pos_row; - break; - case 2: - width = grid_pos_col + bw - ref_pos_col; - height = ref_pos_row + bh - grid_pos_row; - break; - case 3: - width = ref_pos_col + bw - grid_pos_col; - height = ref_pos_row + bh - grid_pos_row; - break; - default: assert(0); - } - - return width * height; -} - -static int round_floor(int ref_pos, int bsize_pix) { - int round; - if (ref_pos < 0) - round = -(1 + (-ref_pos - 1) / bsize_pix); - else - round = ref_pos / bsize_pix; - - return round; -} - -static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, - BLOCK_SIZE bsize, int stride) { - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; - int idx, idy; - - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx]; - const int64_t 
mc_flow = tpl_ptr->mc_flow; - const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost; - *tpl_ptr = *src_stats; - tpl_ptr->mc_flow = mc_flow; - tpl_ptr->mc_ref_cost = mc_ref_cost; - tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; - } - } -} - -static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, - int mi_row, int mi_col, const BLOCK_SIZE bsize) { - TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; - TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; - MV mv = tpl_stats->mv.as_mv; - int mv_row = mv.row >> 3; - int mv_col = mv.col >> 3; - - int ref_pos_row = mi_row * MI_SIZE + mv_row; - int ref_pos_col = mi_col * MI_SIZE + mv_col; - - const int bw = 4 << b_width_log2_lookup[bsize]; - const int bh = 4 << b_height_log2_lookup[bsize]; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const int pix_num = bw * bh; - - // top-left on grid block location in pixel - int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; - int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; - int block; - - for (block = 0; block < 4; ++block) { - int grid_pos_row = grid_pos_row_base + bh * (block >> 1); - int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); - - if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && - grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { - int overlap_area = get_overlap_area( - grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); - int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; - int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; - - int64_t mc_flow = tpl_stats->mc_dep_cost - - (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) / - tpl_stats->intra_cost; - - int idx, idy; - - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - TplDepStats *des_stats = - &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + - (ref_mi_col + idx)]; - - des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; - des_stats->mc_ref_cost += - ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) / - pix_num; - assert(overlap_area >= 0); - } - } - } - } -} - -static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, - int mi_row, int mi_col, const BLOCK_SIZE bsize) { - int idx, idy; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - TplDepStats *tpl_ptr = - &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; - tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, - BLOCK_8X8); - } - } -} - -static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - TX_SIZE tx_size, int64_t *recon_error, - int64_t *sse) { - MACROBLOCKD *const xd = &x->e_mbd; - const struct macroblock_plane *const p = &x->plane[plane]; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; - uint16_t eob; - int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; - const int shift = tx_size == TX_32X32 ? 0 : 2; - - // skip block condition should be handled before this is called. 
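// A worked example for the overlap accounting in tpl_model_update_b above,
// with a 16x16 block whose motion lands at ref_pos (row 10, col 6): the grid
// base is (round_floor(10, 16) * 16, round_floor(6, 16) * 16) == (0, 0), and
// the four candidate grid blocks receive
//   block 0: width = 0 + 16 - 6 = 10, height = 0 + 16 - 10 = 6   ->  60
//   block 1: width = 6 + 16 - 16 = 6, height = 0 + 16 - 10 = 6   ->  36
//   block 2: width = 0 + 16 - 6 = 10, height = 10 + 16 - 16 = 10 -> 100
//   block 3: width = 6 + 16 - 16 = 6, height = 10 + 16 - 16 = 10 ->  60
// which sums to 256 == pix_num, so the mc_flow share is partitioned exactly.
// Note round_floor() floors toward negative infinity, e.g.
// round_floor(-6, 16) == -1, keeping out-of-frame positions on the correct
// grid block.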
- assert(!x->skip_block); - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, &eob, - scan_order->scan, scan_order->iscan); - } else { - vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, - scan_order->iscan); - } -#else - vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, - scan_order->iscan); -#endif // CONFIG_VP9_HIGHBITDEPTH - - *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; - *recon_error = VPXMAX(*recon_error, 1); - - *sse = (*sse) >> shift; - *sse = VPXMAX(*sse, 1); -} - -#if CONFIG_VP9_HIGHBITDEPTH -void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size) { - // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms. - switch (tx_size) { - case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break; - case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break; - case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break; - default: assert(0); - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size) { - switch (tx_size) { - case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break; - case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break; - case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break; - default: assert(0); - } -} - -static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, - int mi_col) { - x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); - x->mv_limits.row_max = - (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND); - x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); - x->mv_limits.col_max = - ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); -} - -static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, - struct scale_factors *sf, GF_PICTURE *gf_picture, - int frame_idx, TplDepFrame *tpl_frame, - int16_t *src_diff, tran_low_t *coeff, - tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, - int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, - YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, - int64_t *recon_error, int64_t *sse) { - VP9_COMMON *cm = &cpi->common; - ThreadData *td = &cpi->td; - - const int bw = 4 << b_width_log2_lookup[bsize]; - const int bh = 4 << b_height_log2_lookup[bsize]; - const int pix_num = bw * bh; - int best_rf_idx = -1; - int_mv best_mv; - int64_t best_inter_cost = INT64_MAX; - int64_t inter_cost; - int rf_idx; - const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; - - int64_t best_intra_cost = INT64_MAX; - int64_t intra_cost; - PREDICTION_MODE mode; - int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; - MODE_INFO mi_above, mi_left; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - TplDepStats *tpl_stats = - &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; - - xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); - xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; - xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); - xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8; - xd->above_mi = (mi_row > 0) ? 
&mi_above : NULL; - xd->left_mi = (mi_col > 0) ? &mi_left : NULL; - - // Intra prediction search - for (mode = DC_PRED; mode <= TM_PRED; ++mode) { - uint8_t *src, *dst; - int src_stride, dst_stride; - - src = xd->cur_buf->y_buffer + mb_y_offset; - src_stride = xd->cur_buf->y_stride; - - dst = &predictor[0]; - dst_stride = bw; - - xd->mi[0]->sb_type = bsize; - xd->mi[0]->ref_frame[0] = INTRA_FRAME; - - vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src, - src_stride, dst, dst_stride, 0, 0, 0); - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, - dst_stride, xd->bd); - highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); - intra_cost = vpx_highbd_satd(coeff, pix_num); - } else { - vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, - dst_stride); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - intra_cost = vpx_satd(coeff, pix_num); - } -#else - vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - intra_cost = vpx_satd(coeff, pix_num); -#endif // CONFIG_VP9_HIGHBITDEPTH - - if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; - } - - // Motion compensated prediction - best_mv.as_int = 0; - - set_mv_limits(cm, x, mi_row, mi_col); - - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - int_mv mv; -#if CONFIG_NON_GREEDY_MV - MotionField *motion_field; -#endif - if (ref_frame[rf_idx] == NULL) continue; - -#if CONFIG_NON_GREEDY_MV - (void)td; - motion_field = vp9_motion_field_info_get_motion_field( - &cpi->motion_field_info, frame_idx, rf_idx, bsize); - mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); -#else - motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset, - ref_frame[rf_idx]->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, bsize, &mv.as_mv); -#endif - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_build_inter_predictor( - CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset), - ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw, - &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, - mi_row * MI_SIZE, xd->bd); - vpx_highbd_subtract_block( - bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); - highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); - inter_cost = vpx_highbd_satd(coeff, pix_num); - } else { - vp9_build_inter_predictor( - ref_frame[rf_idx]->y_buffer + mb_y_offset, - ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh, - 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE); - vpx_subtract_block(bh, bw, src_diff, bw, - xd->cur_buf->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, &predictor[0], bw); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - inter_cost = vpx_satd(coeff, pix_num); - } -#else - vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset, - ref_frame[rf_idx]->y_stride, &predictor[0], bw, - &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, - mi_col * MI_SIZE, mi_row * MI_SIZE); - vpx_subtract_block(bh, bw, src_diff, bw, - xd->cur_buf->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, &predictor[0], bw); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - inter_cost = vpx_satd(coeff, pix_num); -#endif - - if (inter_cost < best_inter_cost) { - best_rf_idx = rf_idx; - best_inter_cost = inter_cost; - best_mv.as_int = 
mv.as_int; - get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, - sse); - } - } - best_intra_cost = VPXMAX(best_intra_cost, 1); - best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost); - tpl_stats->inter_cost = VPXMAX( - 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); - tpl_stats->intra_cost = VPXMAX( - 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); - tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; - tpl_stats->mv.as_int = best_mv.as_int; -} - -#if CONFIG_NON_GREEDY_MV -static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture, - int frame_idx, int rf_idx, int mi_row, - int mi_col, struct buf_2d *src, - struct buf_2d *pre) { - const int mb_y_offset = - mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; - YV12_BUFFER_CONFIG *ref_frame = NULL; - int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; - if (ref_frame_idx != -1) { - ref_frame = gf_picture[ref_frame_idx].frame; - src->buf = xd->cur_buf->y_buffer + mb_y_offset; - src->stride = xd->cur_buf->y_stride; - pre->buf = ref_frame->y_buffer + mb_y_offset; - pre->stride = ref_frame->y_stride; - assert(src->stride == pre->stride); - return 1; - } else { - printf("invalid ref_frame_idx"); - assert(ref_frame_idx != -1); - return 0; - } -} - -#define kMvPreCheckLines 5 -#define kMvPreCheckSize 15 - -#define MV_REF_POS_NUM 3 -POSITION mv_ref_pos[MV_REF_POS_NUM] = { - { -1, 0 }, - { 0, -1 }, - { -1, -1 }, -}; - -static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row, - int mi_col) { - return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col]; -} - -static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, - BLOCK_SIZE bsize, int mi_row, int mi_col) { - int i; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int_mv nearest_mv, near_mv, invalid_mv; - nearest_mv.as_int = INVALID_MV; - near_mv.as_int = INVALID_MV; - invalid_mv.as_int = INVALID_MV; - for (i = 0; i < MV_REF_POS_NUM; ++i) { - int nb_row = mi_row + mv_ref_pos[i].row * mi_height; - int nb_col = mi_col + mv_ref_pos[i].col * mi_width; - assert(mv_ref_pos[i].row <= 0); - assert(mv_ref_pos[i].col <= 0); - if (nb_row >= 0 && nb_col >= 0) { - if (nearest_mv.as_int == INVALID_MV) { - nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); - } else { - int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); - if (mv.as_int == nearest_mv.as_int) { - continue; - } else { - near_mv = mv; - break; - } - } - } - } - if (nearest_mv.as_int == INVALID_MV) { - nearest_mv.as_mv.row = 0; - nearest_mv.as_mv.col = 0; - } - if (near_mv.as_int == INVALID_MV) { - near_mv.as_mv.row = 0; - near_mv.as_mv.col = 0; - } - if (mv_mode == NEAREST_MV_MODE) { - return nearest_mv; - } - if (mv_mode == NEAR_MV_MODE) { - return near_mv; - } - assert(0); - return invalid_mv; -} - -static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi, - MotionField *motion_field, - TplDepFrame *tpl_frame, BLOCK_SIZE bsize, - int mi_row, int mi_col) { - int_mv mv; - switch (mv_mode) { - case ZERO_MV_MODE: - mv.as_mv.row = 0; - mv.as_mv.col = 0; - break; - case NEW_MV_MODE: - mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); - break; - case NEAREST_MV_MODE: - mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); - break; - case NEAR_MV_MODE: - mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); - break; - default: - mv.as_int = 
INVALID_MV; - assert(0); - break; - } - return mv; -} - -static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd, - GF_PICTURE *gf_picture, MotionField *motion_field, - int frame_idx, TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col, - int_mv *mv) { - uint32_t sse; - struct buf_2d src; - struct buf_2d pre; - MV full_mv; - *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize, - mi_row, mi_col); - full_mv = get_full_mv(&mv->as_mv); - if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col, - &src, &pre)) { - // TODO(angiebird): Consider subpixel when computing the sse. - cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv), - pre.stride, &sse); - return (double)(sse << VP9_DIST_SCALE_LOG2); - } else { - assert(0); - return 0; - } -} - -static int get_mv_mode_cost(int mv_mode) { - // TODO(angiebird): The probabilities are roughly inferred from - // default_inter_mode_probs. Check if there is a better way to set the - // probabilities. - const int zero_mv_prob = 16; - const int new_mv_prob = 24 * 1; - const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob; - assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256); - switch (mv_mode) { - case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break; - case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break; - case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; - case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; - default: assert(0); return -1; - } -} - -static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) { - double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) + - log2(1 + abs(new_mv->col - ref_mv->col)); - mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT); - return mv_diff_cost; -} -static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field, - TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row, - int mi_col) { - double mv_cost = get_mv_mode_cost(mv_mode); - if (mv_mode == NEW_MV_MODE) { - MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, - bsize, mi_row, mi_col) - .as_mv; - MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field, - tpl_frame, bsize, mi_row, mi_col) - .as_mv; - MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame, - bsize, mi_row, mi_col) - .as_mv; - double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv); - double near_cost = get_mv_diff_cost(&new_mv, &near_mv); - mv_cost += nearest_cost < near_cost ? 
nearest_cost : near_cost; - } - return mv_cost; -} - -static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, MotionField *motion_field, - int frame_idx, TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col, - int_mv *mv) { - MACROBLOCKD *xd = &x->e_mbd; - double mv_dist = - get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx, - tpl_frame, rf_idx, bsize, mi_row, mi_col, mv); - double mv_cost = - get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col); - double mult = 180; - - return mv_cost + mult * log2f(1 + mv_dist); -} - -static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, - MotionField *motion_field, int frame_idx, - TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col, - double *rd, int_mv *mv) { - int best_mv_mode = ZERO_MV_MODE; - int update = 0; - int mv_mode; - *rd = 0; - for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) { - double this_rd; - int_mv this_mv; - if (mv_mode == NEW_MV_MODE) { - continue; - } - this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx, - tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv); - if (update == 0) { - *rd = this_rd; - *mv = this_mv; - best_mv_mode = mv_mode; - update = 1; - } else { - if (this_rd < *rd) { - *rd = this_rd; - *mv = this_mv; - best_mv_mode = mv_mode; - } - } - } - return best_mv_mode; -} - -static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, MotionField *motion_field, - int frame_idx, TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col) { - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int tmp_mv_mode_arr[kMvPreCheckSize]; - int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx]; - double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx]; - int_mv *select_mv_arr = cpi->select_mv_arr; - int_mv tmp_select_mv_arr[kMvPreCheckSize]; - int stride = tpl_frame->stride; - double new_mv_rd = 0; - double no_new_mv_rd = 0; - double this_new_mv_rd = 0; - double this_no_new_mv_rd = 0; - int idx; - int tmp_idx; - assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1); - - // no new mv - // diagonal scan order - tmp_idx = 0; - for (idx = 0; idx < kMvPreCheckLines; ++idx) { - int r; - for (r = 0; r <= idx; ++r) { - int c = idx - r; - int nb_row = mi_row + r * mi_height; - int nb_col = mi_col + c * mi_width; - if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { - double this_rd; - int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; - mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( - cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, - bsize, nb_row, nb_col, &this_rd, mv); - if (r == 0 && c == 0) { - this_no_new_mv_rd = this_rd; - } - no_new_mv_rd += this_rd; - tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col]; - tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col]; - ++tmp_idx; - } - } - } - - // new mv - mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE; - this_new_mv_rd = eval_mv_mode( - NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, - rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]); - new_mv_rd = this_new_mv_rd; - // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE - // beforehand. 
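// The pre-check window in predict_mv_mode above is scanned anti-diagonally:
// for each idx, the pairs (r, c) with r + c == idx are visited, i.e.
//   (0,0); (0,1),(1,0); (0,2),(1,1),(2,0); ...
// With kMvPreCheckLines == 5 that covers 1 + 2 + 3 + 4 + 5 == 15 ==
// kMvPreCheckSize cells, which is exactly what the assert at the top of the
// function checks.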
- for (idx = 1; idx < kMvPreCheckLines; ++idx) { - int r; - for (r = 0; r <= idx; ++r) { - int c = idx - r; - int nb_row = mi_row + r * mi_height; - int nb_col = mi_col + c * mi_width; - if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { - double this_rd; - int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; - mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( - cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, - bsize, nb_row, nb_col, &this_rd, mv); - new_mv_rd += this_rd; - } - } - } - - // update best_mv_mode - tmp_idx = 0; - if (no_new_mv_rd < new_mv_rd) { - for (idx = 0; idx < kMvPreCheckLines; ++idx) { - int r; - for (r = 0; r <= idx; ++r) { - int c = idx - r; - int nb_row = mi_row + r * mi_height; - int nb_col = mi_col + c * mi_width; - if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { - mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx]; - select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx]; - ++tmp_idx; - } - } - } - rd_diff_arr[mi_row * stride + mi_col] = 0; - } else { - rd_diff_arr[mi_row * stride + mi_col] = - (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd); - } -} - -static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, - MotionField *motion_field, int frame_idx, - TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize) { - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const int unit_rows = tpl_frame->mi_rows / mi_height; - const int unit_cols = tpl_frame->mi_cols / mi_width; - const int max_diagonal_lines = unit_rows + unit_cols - 1; - int idx; - for (idx = 0; idx < max_diagonal_lines; ++idx) { - int r; - for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1); - ++r) { - int c = idx - r; - int mi_row = r * mi_height; - int mi_col = c * mi_width; - assert(c >= 0 && c < unit_cols); - assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows); - assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols); - predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, - rf_idx, bsize, mi_row, mi_col); - } - } -} - -static void do_motion_search(VP9_COMP *cpi, ThreadData *td, - MotionField *motion_field, int frame_idx, - YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize, - int mi_row, int mi_col) { - VP9_COMMON *cm = &cpi->common; - MACROBLOCK *x = &td->mb; - MACROBLOCKD *xd = &x->e_mbd; - const int mb_y_offset = - mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; - assert(ref_frame != NULL); - set_mv_limits(cm, x, mi_row, mi_col); - { - int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); - uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset; - uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset; - const int stride = xd->cur_buf->y_stride; - full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf, - ref_frame_buf, stride, bsize, mi_row, mi_col, - &mv.as_mv); - sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride, - bsize, &mv.as_mv); - vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv); - } -} - -static void build_motion_field( - VP9_COMP *cpi, int frame_idx, - YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) { - VP9_COMMON *cm = &cpi->common; - ThreadData *td = &cpi->td; - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const int 
pw = num_4x4_blocks_wide_lookup[bsize] << 2; - const int ph = num_4x4_blocks_high_lookup[bsize] << 2; - int mi_row, mi_col; - int rf_idx; - - tpl_frame->lambda = (pw * ph) >> 2; - assert(pw * ph == tpl_frame->lambda << 2); - - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - MotionField *motion_field = vp9_motion_field_info_get_motion_field( - &cpi->motion_field_info, frame_idx, rf_idx, bsize); - if (ref_frame[rf_idx] == NULL) { - continue; - } - vp9_motion_field_reset_mvs(motion_field); - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { - do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx], - bsize, mi_row, mi_col); - } - } - } -} -#endif // CONFIG_NON_GREEDY_MV - -static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, - int frame_idx, BLOCK_SIZE bsize) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; - YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; - - VP9_COMMON *cm = &cpi->common; - struct scale_factors sf; - int rdmult, idx; - ThreadData *td = &cpi->td; - MACROBLOCK *x = &td->mb; - MACROBLOCKD *xd = &x->e_mbd; - int mi_row, mi_col; - -#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]); - DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]); - uint8_t *predictor; -#else - DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]); -#endif - DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); - DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); - - const TX_SIZE tx_size = max_txsize_lookup[bsize]; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int64_t recon_error, sse; -#if CONFIG_NON_GREEDY_MV - int square_block_idx; - int rf_idx; -#endif - - // Setup scaling factor -#if CONFIG_VP9_HIGHBITDEPTH - vp9_setup_scale_factors_for_frame( - &sf, this_frame->y_crop_width, this_frame->y_crop_height, - this_frame->y_crop_width, this_frame->y_crop_height, - cpi->common.use_highbitdepth); - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - predictor = CONVERT_TO_BYTEPTR(predictor16); - else - predictor = predictor8; -#else - vp9_setup_scale_factors_for_frame( - &sf, this_frame->y_crop_width, this_frame->y_crop_height, - this_frame->y_crop_width, this_frame->y_crop_height); -#endif // CONFIG_VP9_HIGHBITDEPTH - - // Prepare reference frame pointers. If any reference frame slot is - // unavailable, the pointer will be set to Null. - for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) { - int rf_idx = gf_picture[frame_idx].ref_frame[idx]; - if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame; - } - - xd->mi = cm->mi_grid_visible; - xd->mi[0] = cm->mi; - xd->cur_buf = this_frame; - - // Get rd multiplier set up. 
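// In the high-bit-depth branch above, 16-bit pixel buffers are threaded
// through 8-bit pointer interfaces as tagged pointers. A sketch of the
// convention, assuming the CONVERT_TO_* macros from vpx_dsp_common.h:
//   uint16_t buf16[32 * 32 * 3];
//   uint8_t *tagged = CONVERT_TO_BYTEPTR(buf16);  /* pass to 8-bit APIs */
//   uint16_t *pixels = CONVERT_TO_SHORTPTR(tagged);  /* recover 16-bit view */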
- rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex); - set_error_per_bit(&cpi->td.mb, rdmult); - vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex); - - tpl_frame->is_valid = 1; - - cm->base_qindex = tpl_frame->base_qindex; - vp9_frame_init_quantizer(cpi); - -#if CONFIG_NON_GREEDY_MV - for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; - ++square_block_idx) { - BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); - build_motion_field(cpi, frame_idx, ref_frame, square_bsize); - } - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; - if (ref_frame_idx != -1) { - MotionField *motion_field = vp9_motion_field_info_get_motion_field( - &cpi->motion_field_info, frame_idx, rf_idx, bsize); - predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx, - tpl_frame, rf_idx, bsize); - } - } -#endif - - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { - mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, - src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, - tx_size, ref_frame, predictor, &recon_error, &sse); - // Motion flow dependency dispenser. - tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, - tpl_frame->stride); - - tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, - bsize); - } - } -} - -#if CONFIG_NON_GREEDY_MV -#define DUMP_TPL_STATS 0 -#if DUMP_TPL_STATS -static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) { - int i, j; - printf("%d %d\n", h, w); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - printf("%d ", buf[(row + i) * stride + col + j]); - } - } - printf("\n"); -} - -static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) { - dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height, - frame_buf->y_width); - dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0, - frame_buf->uv_height, frame_buf->uv_width); - dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0, - frame_buf->uv_height, frame_buf->uv_width); -} - -static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames, - const GF_GROUP *gf_group, - const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) { - int frame_idx; - const VP9_COMMON *cm = &cpi->common; - int rf_idx; - for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) { - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - int mi_row, mi_col; - int ref_frame_idx; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; - if (ref_frame_idx != -1) { - YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame; - const int gf_frame_offset = gf_group->frame_gop_index[frame_idx]; - const int ref_gf_frame_offset = - gf_group->frame_gop_index[ref_frame_idx]; - printf("=\n"); - printf( - "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d " - "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n", - frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE, - ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset); - for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { - for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { - if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { - int_mv mv = 
vp9_motion_field_info_get_mv(&cpi->motion_field_info, - frame_idx, rf_idx, bsize, - mi_row, mi_col); - printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row, - mv.as_mv.col); - } - } - } - for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { - for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { - if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { - const TplDepStats *tpl_ptr = - &tpl_frame - ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; - printf("%f ", tpl_ptr->feature_score); - } - } - } - printf("\n"); - - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { - const int mv_mode = - tpl_frame - ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col]; - printf("%d ", mv_mode); - } - } - printf("\n"); - - dump_frame_buf(gf_picture[frame_idx].frame); - dump_frame_buf(ref_frame_buf); - } - } - } -} -#endif // DUMP_TPL_STATS -#endif // CONFIG_NON_GREEDY_MV - -static void init_tpl_buffer(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - int frame; - - const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); -#if CONFIG_NON_GREEDY_MV - int rf_idx; - - vpx_free(cpi->select_mv_arr); - CHECK_MEM_ERROR( - cm, cpi->select_mv_arr, - vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); -#endif - - // TODO(jingning): Reduce the actual memory use for tpl model build up. - for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { - if (cpi->tpl_stats[frame].width >= mi_cols && - cpi->tpl_stats[frame].height >= mi_rows && - cpi->tpl_stats[frame].tpl_stats_ptr) - continue; - -#if CONFIG_NON_GREEDY_MV - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); - CHECK_MEM_ERROR( - cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], - vpx_calloc(mi_rows * mi_cols * 4, - sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx]))); - vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); - CHECK_MEM_ERROR( - cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx], - vpx_calloc(mi_rows * mi_cols * 4, - sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx]))); - } -#endif - vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); - CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr, - vpx_calloc(mi_rows * mi_cols, - sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); - cpi->tpl_stats[frame].is_valid = 0; - cpi->tpl_stats[frame].width = mi_cols; - cpi->tpl_stats[frame].height = mi_rows; - cpi->tpl_stats[frame].stride = mi_cols; - cpi->tpl_stats[frame].mi_rows = cm->mi_rows; - cpi->tpl_stats[frame].mi_cols = cm->mi_cols; - } - - for (frame = 0; frame < REF_FRAMES; ++frame) { - cpi->enc_frame_buf[frame].mem_valid = 0; - cpi->enc_frame_buf[frame].released = 1; - } -} - -static void free_tpl_buffer(VP9_COMP *cpi) { - int frame; -#if CONFIG_NON_GREEDY_MV - vp9_free_motion_field_info(&cpi->motion_field_info); - vpx_free(cpi->select_mv_arr); -#endif - for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { -#if CONFIG_NON_GREEDY_MV - int rf_idx; - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); - vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); - } -#endif - vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); - cpi->tpl_stats[frame].is_valid = 0; - } -} - -#if CONFIG_RATE_CTRL -static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - const GF_GROUP *gf_group = &cpi->twopass.gf_group; - int show_frame_count = 0; - int frame_idx; - // Accumulate tpl 
stats for each frame in the current group of picture. - for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; - const int tpl_stride = tpl_frame->stride; - int64_t intra_cost_base = 0; - int64_t inter_cost_base = 0; - int64_t mc_dep_cost_base = 0; - int64_t mc_ref_cost_base = 0; - int64_t mc_flow_base = 0; - int row, col; - - if (!tpl_frame->is_valid) continue; - - for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { - for (col = 0; col < cm->mi_cols; ++col) { - TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; - intra_cost_base += this_stats->intra_cost; - inter_cost_base += this_stats->inter_cost; - mc_dep_cost_base += this_stats->mc_dep_cost; - mc_ref_cost_base += this_stats->mc_ref_cost; - mc_flow_base += this_stats->mc_flow; - } - } - - cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base; - cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base; - cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base; - cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base; - cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base; - - ++show_frame_count; - } -} -#endif // CONFIG_RATE_CTRL - -static void setup_tpl_stats(VP9_COMP *cpi) { - GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; - const GF_GROUP *gf_group = &cpi->twopass.gf_group; - int tpl_group_frames = 0; - int frame_idx; - cpi->tpl_bsize = BLOCK_32X32; - - init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); - - init_tpl_stats(cpi); - - // Backward propagation from tpl_group_frames to 1. - for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { - if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; - mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); - } -#if CONFIG_NON_GREEDY_MV - cpi->tpl_ready = 1; -#if DUMP_TPL_STATS - dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize); -#endif // DUMP_TPL_STATS -#endif // CONFIG_NON_GREEDY_MV - -#if CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - accumulate_frame_tpl_stats(cpi); - } -#endif // CONFIG_RATE_CTRL -} - void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int *ref_frame_coding_indexes, @@ -7663,6 +6373,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, const int gf_group_index = cpi->twopass.gf_group.index; int i; +#if CONFIG_COLLECT_COMPONENT_TIMING + if (oxcf->pass == 2) start_timing(cpi, vp9_get_compressed_data_time); +#endif + if (is_one_pass_svc(cpi)) { vp9_one_pass_svc_start_layer(cpi); } @@ -7727,9 +6441,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, int not_last_frame = (cpi->lookahead->sz - arf_src_index > 1); not_last_frame |= ALT_REF_AQ_APPLY_TO_LAST_FRAME; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_temporal_filter_time); +#endif // Produce the filtered ARF frame. 
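// (vp9_temporal_filter() below blends frames from the lookahead window around
// arf_src_index into cpi->alt_ref_buffer, whose borders are then extended for
// subsequent motion search.)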
vp9_temporal_filter(cpi, arf_src_index); vpx_extend_frame_borders(&cpi->alt_ref_buffer); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_temporal_filter_time); +#endif // for small bitrates segmentation overhead usually // eats all bitrate gain from enabling delta quantizers @@ -7843,7 +6563,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #if !CONFIG_REALTIME_ONLY if ((oxcf->pass == 2) && !cpi->use_svc) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rc_get_second_pass_params_time); +#endif vp9_rc_get_second_pass_params(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rc_get_second_pass_params_time); +#endif } else if (oxcf->pass == 1) { set_frame_size(cpi); } @@ -7864,7 +6590,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, pthread_mutex_init(&cpi->kmeans_mutex, NULL); #endif CHECK_MEM_ERROR( - cm, cpi->kmeans_data_arr, + &cm->error, cpi->kmeans_data_arr, vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->kmeans_data_arr))); cpi->kmeans_data_stride = mi_cols; cpi->kmeans_data_arr_alloc = 1; @@ -7883,13 +6609,19 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } #endif // CONFIG_NON_GREEDY_MV +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, setup_tpl_stats_time); +#endif if (gf_group_index == 1 && cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE && cpi->sf.enable_tpl_model) { - init_tpl_buffer(cpi); + vp9_init_tpl_buffer(cpi); vp9_estimate_qp_gop(cpi); - setup_tpl_stats(cpi); + vp9_setup_tpl_stats(cpi); } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, setup_tpl_stats_time); +#endif #if CONFIG_BITSTREAM_DEBUG assert(cpi->oxcf.max_threads == 0 && @@ -7926,8 +6658,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->td.mb.inv_txfm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; vp9_first_pass(cpi, source); } else if (oxcf->pass == 2 && !cpi->use_svc) { +#if CONFIG_COLLECT_COMPONENT_TIMING + // Accumulate 2nd pass time in 2-pass case. + start_timing(cpi, Pass2Encode_time); +#endif Pass2Encode(cpi, size, dest, frame_flags, encode_frame_result); vp9_twopass_postencode_update(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, Pass2Encode_time); +#endif } else if (cpi->use_svc) { SvcEncode(cpi, size, dest, frame_flags); } else { @@ -8130,6 +6869,41 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif +#if CONFIG_COLLECT_COMPONENT_TIMING + if (oxcf->pass == 2) end_timing(cpi, vp9_get_compressed_data_time); + + // Print out timing information. + // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of + // show_existing_frame and lag-in-frames. + // if (cpi->frame_component_time[0] > 100) + if (oxcf->pass == 2) { + uint64_t frame_total = 0, total = 0; + int i; + + fprintf(stderr, + "\n Frame number: %d, Frame type: %s, Show Frame: %d, Q: %d\n", + cm->current_video_frame, get_frame_type_enum(cm->frame_type), + cm->show_frame, cm->base_qindex); + for (i = 0; i < kTimingComponents; i++) { + cpi->component_time[i] += cpi->frame_component_time[i]; + // Use vp9_get_compressed_data_time (i = 0) as the total time. 
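// (The timed components are nested -- e.g. the recode loop contains the
// partition and mode searches -- so the percentages printed below are each a
// share of the vp9_get_compressed_data_time total and will not generally sum
// to 100%.)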
+ if (i == 0) { + frame_total = cpi->frame_component_time[0]; + total = cpi->component_time[0]; + } + fprintf(stderr, + " %50s: %15" PRId64 " us [%6.2f%%] (total: %15" PRId64 + " us [%6.2f%%])\n", + get_component_name(i), cpi->frame_component_time[i], + (float)((float)cpi->frame_component_time[i] * 100.0 / + (float)frame_total), + cpi->component_time[i], + (float)((float)cpi->component_time[i] * 100.0 / (float)total)); + cpi->frame_component_time[i] = 0; + } + } +#endif + if (is_one_pass_svc(cpi)) { if (cm->show_frame) { ++cpi->svc.spatial_layer_to_encode; @@ -8172,12 +6946,12 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, } } -int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode) { +int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode) { VP9_COMMON *cm = &cpi->common; int hr = 0, hs = 0, vr = 0, vs = 0; - if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1; + if (horiz_mode > VP8E_ONETWO || vert_mode > VP8E_ONETWO) return -1; Scale2Ratio(horiz_mode, &hr, &hs); Scale2Ratio(vert_mode, &vr, &vs); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index cca8b53f8..7136f7faa 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -14,9 +14,11 @@ #include <stdio.h> #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vpx_ext_ratectrl.h" #include "vpx/vp8cx.h" +#include "vpx/vpx_tpl.h" #if CONFIG_INTERNAL_STATS #include "vpx_dsp/ssim.h" #endif @@ -91,13 +93,6 @@ typedef enum { } ENCODE_BREAKOUT_TYPE; typedef enum { - NORMAL = 0, - FOURFIVE = 1, - THREEFIVE = 2, - ONETWO = 3 -} VPX_SCALING; - -typedef enum { // Good Quality Fast Encoding. The encoder balances quality with the amount of // time it takes to encode the output. Speed setting controls how fast. GOOD, @@ -336,15 +331,14 @@ typedef struct TplDepFrame { typedef struct TileDataEnc { TileInfo tile_info; int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES]; -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int8_t mode_map[BLOCK_SIZES][MAX_MODES]; FIRSTPASS_DATA fp_data; VP9RowMTSync row_mt_sync; // Used for adaptive_rd_thresh with row multithreading int *row_base_thresh_freq_fact; + MV firstpass_top_mv; } TileDataEnc; typedef struct RowMTInfo { @@ -513,6 +507,7 @@ typedef struct EncFrameBuf { } EncFrameBuf; // Maximum operating frame buffer size needed for a GOP using ARF reference. +// This is used to allocate the memory for TPL stats for a GOP. #define MAX_ARF_GOP_SIZE (2 * MAX_LAG_BUFFERS) #define MAX_KMEANS_GROUPS 8 @@ -659,6 +654,72 @@ static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; } static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; } #endif // CONFIG_RATE_CTRL +#if CONFIG_COLLECT_COMPONENT_TIMING +#include "vpx_ports/vpx_timer.h" +// Adjust the following to add new components. 
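// To add a new component: append an enum entry before kTimingComponents, add
// the matching string to get_component_name(), and bracket the code of
// interest with the timer macros, e.g. (my_new_component_time is a
// hypothetical entry):
//   start_timing(cpi, my_new_component_time);
//   /* ... code under measurement ... */
//   end_timing(cpi, my_new_component_time);
// A plausible shape for the two macros, assuming the vpx_usec_timer API from
// vpx_ports/vpx_timer.h (the real definitions may differ):
//   #define start_timing(cpi, component) \
//     vpx_usec_timer_start(&(cpi)->component_timer[component])
//   #define end_timing(cpi, component)                                  \
//     do {                                                              \
//       vpx_usec_timer_mark(&(cpi)->component_timer[component]);        \
//       (cpi)->frame_component_time[component] +=                       \
//           vpx_usec_timer_elapsed(&(cpi)->component_timer[component]); \
//     } while (0)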
+typedef enum { + vp9_get_compressed_data_time, + vp9_temporal_filter_time, + vp9_rc_get_second_pass_params_time, + setup_tpl_stats_time, + Pass2Encode_time, + + encode_with_recode_loop_time, + loopfilter_frame_time, + vp9_pack_bitstream_time, + + encode_frame_internal_time, + rd_pick_partition_time, + rd_pick_sb_modes_time, + encode_sb_time, + + vp9_rd_pick_inter_mode_sb_time, + vp9_rd_pick_inter_mode_sub8x8_time, + + intra_mode_search_time, + handle_inter_mode_time, + single_motion_search_time, + joint_motion_search_time, + interp_filter_time, + + kTimingComponents, +} TIMING_COMPONENT; + +static INLINE char const *get_component_name(int index) { + switch (index) { + case vp9_get_compressed_data_time: return "vp9_get_compressed_data_time"; + case vp9_temporal_filter_time: return "vp9_temporal_filter_time"; + case vp9_rc_get_second_pass_params_time: + return "vp9_rc_get_second_pass_params_time"; + case setup_tpl_stats_time: return "setup_tpl_stats_time"; + case Pass2Encode_time: return "Pass2Encode_time"; + + case encode_with_recode_loop_time: return "encode_with_recode_loop_time"; + case loopfilter_frame_time: return "loopfilter_frame_time"; + case vp9_pack_bitstream_time: return "vp9_pack_bitstream_time"; + + case encode_frame_internal_time: return "encode_frame_internal_time"; + case rd_pick_partition_time: return "rd_pick_partition_time"; + case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time"; + case encode_sb_time: return "encode_sb_time"; + + case vp9_rd_pick_inter_mode_sb_time: + return "vp9_rd_pick_inter_mode_sb_time"; + case vp9_rd_pick_inter_mode_sub8x8_time: + return "vp9_rd_pick_inter_mode_sub8x8_time"; + + case intra_mode_search_time: return "intra_mode_search_time"; + case handle_inter_mode_time: return "handle_inter_mode_time"; + case single_motion_search_time: return "single_motion_search_time"; + case joint_motion_search_time: return "joint_motion_search_time"; + case interp_filter_time: return "interp_filter_time"; + + default: assert(0); + } + return "error"; +} +#endif + typedef struct VP9_COMP { FRAME_INFO frame_info; QUANTS quants; @@ -685,6 +746,8 @@ typedef struct VP9_COMP { BLOCK_SIZE tpl_bsize; TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE]; + // Used to store TPL stats before propagation + VpxTplGopStats tpl_gop_stats; YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES]; EncFrameBuf enc_frame_buf[REF_FRAMES]; #if CONFIG_MULTITHREAD @@ -784,7 +847,7 @@ typedef struct VP9_COMP { uint8_t *skin_map; - // segment threashold for encode breakout + // segment threshold for encode breakout int segment_encode_breakout[MAX_SEGMENTS]; CYCLIC_REFRESH *cyclic_refresh; @@ -858,12 +921,15 @@ typedef struct VP9_COMP { // number of MBs in the current frame when the frame is // scaled. + int last_coded_width; + int last_coded_height; + int use_svc; SVC svc; // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type. - diff *source_diff_var; + Diff *source_diff_var; // The threshold used in SOURCE_VAR_BASED_PARTITION search type. unsigned int source_var_thresh; int frames_till_next_var_check; @@ -973,6 +1039,29 @@ typedef struct VP9_COMP { EXT_RATECTRL ext_ratectrl; int fixed_qp_onepass; + + // Flag to keep track of dynamic change in deadline mode + // (good/best/realtime). + MODE deadline_mode_previous_frame; + + // Flag to disable scene detection when rtc rate control library is used. + int disable_scene_detection_rtc_ratectrl; + +#if CONFIG_COLLECT_COMPONENT_TIMING + /*! + * component_time[] are initialized to zero while encoder starts. 
+ */ + uint64_t component_time[kTimingComponents]; + /*! + * Stores timing for individual components between calls of start_timing() + * and end_timing(). + */ + struct vpx_usec_timer component_timer[kTimingComponents]; + /*! + * frame_component_time[] are initialized to zero at beginning of each frame. + */ + uint64_t frame_component_time[kTimingComponents]; +#endif } VP9_COMP; #if CONFIG_RATE_CTRL @@ -983,7 +1072,7 @@ static INLINE void partition_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width); const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height); - CHECK_MEM_ERROR(cm, cpi->partition_info, + CHECK_MEM_ERROR(&cm->error, cpi->partition_info, (PARTITION_INFO *)vpx_calloc(unit_width * unit_height, sizeof(PARTITION_INFO))); memset(cpi->partition_info, 0, @@ -998,8 +1087,8 @@ static INLINE void free_partition_info(struct VP9_COMP *cpi) { } static INLINE void reset_mv_info(MOTION_VECTOR_INFO *mv_info) { - mv_info->ref_frame[0] = NONE; - mv_info->ref_frame[1] = NONE; + mv_info->ref_frame[0] = NO_REF_FRAME; + mv_info->ref_frame[1] = NO_REF_FRAME; mv_info->mv[0].as_int = INVALID_MV; mv_info->mv[1].as_int = INVALID_MV; } @@ -1011,7 +1100,7 @@ static INLINE void motion_vector_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width); const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height); - CHECK_MEM_ERROR(cm, cpi->motion_vector_info, + CHECK_MEM_ERROR(&cm->error, cpi->motion_vector_info, (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height, sizeof(MOTION_VECTOR_INFO))); memset(cpi->motion_vector_info, 0, @@ -1030,7 +1119,7 @@ static INLINE void free_motion_vector_info(struct VP9_COMP *cpi) { static INLINE void tpl_stats_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; CHECK_MEM_ERROR( - cm, cpi->tpl_stats_info, + &cm->error, cpi->tpl_stats_info, (TplDepStats *)vpx_calloc(MAX_LAG_BUFFERS, sizeof(TplDepStats))); memset(cpi->tpl_stats_info, 0, MAX_LAG_BUFFERS * sizeof(TplDepStats)); } @@ -1049,7 +1138,7 @@ static INLINE void fp_motion_vector_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int unit_width = get_num_unit_16x16(cpi->frame_info.frame_width); const int unit_height = get_num_unit_16x16(cpi->frame_info.frame_height); - CHECK_MEM_ERROR(cm, cpi->fp_motion_vector_info, + CHECK_MEM_ERROR(&cm->error, cpi->fp_motion_vector_info, (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height, sizeof(MOTION_VECTOR_INFO))); } @@ -1154,8 +1243,8 @@ int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int cols); -int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode); +int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode); int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, unsigned int height); @@ -1296,6 +1385,14 @@ void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv); +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd); +#else +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); +#endif // CONFIG_VP9_HIGHBITDEPTH + 
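// The CHECK_MEM_ERROR call sites above now receive &cm->error instead of cm
// itself. A minimal model of the post-change macro (sketch; the real one in
// vpx/internal/vpx_codec_internal.h may differ in detail):
//   #define CHECK_MEM_ERROR(error, lval, expr)                \
//     do {                                                    \
//       (lval) = (expr);                                      \
//       if (!(lval))                                          \
//         vpx_internal_error(error, VPX_CODEC_MEM_ERROR,      \
//                            "Failed to allocate " #lval);    \
//     } while (0)
// Passing the vpx_internal_error_info directly decouples the macro from
// VP9_COMMON.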
YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, @@ -1380,9 +1477,10 @@ static INLINE int log_tile_cols_from_picsize_level(uint32_t width, VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); -int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, - unsigned int cols, int delta_q[8], int delta_lf[8], - int skip[8], int ref_frame[8]); +vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, + unsigned int rows, unsigned int cols, + int delta_q[8], int delta_lf[8], int skip[8], + int ref_frame[8]); void vp9_new_framerate(VP9_COMP *cpi, double framerate); @@ -1392,6 +1490,171 @@ int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr); #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) +static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { + RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; + if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || + new_fb_ptr->mi_cols < cm->mi_cols) { + vpx_free(new_fb_ptr->mvs); + CHECK_MEM_ERROR(&cm->error, new_fb_ptr->mvs, + (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*new_fb_ptr->mvs))); + new_fb_ptr->mi_rows = cm->mi_rows; + new_fb_ptr->mi_cols = cm->mi_cols; + } +} + +static INLINE int mv_cost(const MV *mv, const int *joint_cost, + int *const comp_cost[2]) { + assert(mv->row >= -MV_MAX && mv->row < MV_MAX); + assert(mv->col >= -MV_MAX && mv->col < MV_MAX); + return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] + + comp_cost[1][mv->col]; +} + +static INLINE int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, + const MV *ref, int sad_per_bit) { + MV diff; + diff.row = mv->row - ref->row; + diff.col = mv->col - ref->col; + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, + VP9_PROB_COST_SHIFT); +} + +static INLINE uint32_t get_start_mv_sad(const MACROBLOCK *x, const MV *mvp_full, + const MV *ref_mv_full, + vpx_sad_fn_t sad_fn_ptr, int sadpb) { + const int src_buf_stride = x->plane[0].src.stride; + const uint8_t *const src_buf = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pred_buf_stride = xd->plane[0].pre[0].stride; + const uint8_t *const pred_buf = + xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col; + uint32_t start_mv_sad = + sad_fn_ptr(src_buf, src_buf_stride, pred_buf, pred_buf_stride); + start_mv_sad += mvsad_err_cost(x, mvp_full, ref_mv_full, sadpb); + + return start_mv_sad; +} + +static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim, + int subsampling_dim, int blk_dim) { + return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim; +} + +// Compute the sum of squares on all visible 4x4s in the transform block. 
+static int64_t sum_squares_visible(const MACROBLOCKD *xd, + const struct macroblockd_plane *const pd, + const int16_t *diff, const int diff_stride, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize, + int *visible_width, int *visible_height) { + int64_t sse; + const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; + const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; + const int b4x4s_to_right_edge = num_4x4_to_edge( + plane_4x4_w, xd->mb_to_right_edge, pd->subsampling_x, blk_col); + const int b4x4s_to_bottom_edge = num_4x4_to_edge( + plane_4x4_h, xd->mb_to_bottom_edge, pd->subsampling_y, blk_row); + if (tx_bsize == BLOCK_4X4 || + (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { + assert(tx_4x4_w == tx_4x4_h); + sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2); + *visible_width = tx_4x4_w << 2; + *visible_height = tx_4x4_h << 2; + } else { + int r, c; + const int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); + const int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); + sse = 0; + // if we are in the unrestricted motion border. + for (r = 0; r < max_r; ++r) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (c = 0; c < max_c; ++c) { + sse += (int64_t)vpx_sum_squares_2d_i16( + diff + r * diff_stride * 4 + c * 4, diff_stride, 4); + } + } + *visible_width = max_c << 2; + *visible_height = max_r << 2; + } + return sse; +} + +// Check if trellis coefficient optimization of the transform block is enabled. +static INLINE int do_trellis_opt(const struct macroblockd_plane *pd, + const int16_t *src_diff, int diff_stride, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + const struct encode_b_args *const args = (struct encode_b_args *)arg; + const MACROBLOCK *const x = args->x; + + switch (args->enable_trellis_opt) { + case DISABLE_TRELLIS_OPT: return 0; + case ENABLE_TRELLIS_OPT: return 1; + case ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR: { + vpx_clear_system_state(); + + return (args->trellis_opt_thresh > 0.0) + ? (x->log_block_src_var <= args->trellis_opt_thresh) + : 1; + } + case ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE: { + const MACROBLOCKD *const xd = &x->e_mbd; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; +#if CONFIG_VP9_HIGHBITDEPTH + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
xd->bd - 5 : 3; +#else + const int dequant_shift = 3; +#endif // CONFIG_VP9_HIGHBITDEPTH + const int qstep = pd->dequant[1] >> dequant_shift; + int *sse_calc_done = args->sse_calc_done; + int64_t *sse = args->sse; + int visible_width = 0, visible_height = 0; + + // TODO: Enable the sf for high bit-depth case + if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) || !sse || + !sse_calc_done) + return 1; + + *sse = sum_squares_visible(xd, pd, src_diff, diff_stride, blk_row, + blk_col, plane_bsize, tx_bsize, &visible_width, + &visible_height); + *sse_calc_done = 1; + + vpx_clear_system_state(); + + return (*(sse) <= (int64_t)visible_width * visible_height * qstep * + qstep * args->trellis_opt_thresh); + } + default: assert(0 && "Invalid trellis optimization method."); return 1; + } +} + +#if CONFIG_COLLECT_COMPONENT_TIMING +static INLINE void start_timing(VP9_COMP *cpi, int component) { + vpx_usec_timer_start(&cpi->component_timer[component]); +} +static INLINE void end_timing(VP9_COMP *cpi, int component) { + vpx_usec_timer_mark(&cpi->component_timer[component]); + cpi->frame_component_time[component] += + vpx_usec_timer_elapsed(&cpi->component_timer[component]); +} +static INLINE char const *get_frame_type_enum(int type) { + switch (type) { + case 0: return "KEY_FRAME"; + case 1: return "INTER_FRAME"; + default: assert(0); + } + return "error"; +} +#endif + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index 453fe2e0d..681996d33 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -94,10 +94,10 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { vp9_bitstream_encode_tiles_buffer_dealloc(cpi); vp9_encode_free_mt_data(cpi); - CHECK_MEM_ERROR(cm, cpi->workers, + CHECK_MEM_ERROR(&cm->error, cpi->workers, vpx_malloc(num_workers * sizeof(*cpi->workers))); - CHECK_MEM_ERROR(cm, cpi->tile_thr_data, + CHECK_MEM_ERROR(&cm->error, cpi->tile_thr_data, vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data))); for (i = 0; i < num_workers; i++) { @@ -111,7 +111,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { thread_data->cpi = cpi; // Allocate thread data. - CHECK_MEM_ERROR(cm, thread_data->td, + CHECK_MEM_ERROR(&cm->error, thread_data->td, vpx_memalign(32, sizeof(*thread_data->td))); vp9_zero(*thread_data->td); @@ -121,7 +121,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { vp9_setup_pc_tree(cm, thread_data->td); // Allocate frame counters in thread data. 
- CHECK_MEM_ERROR(cm, thread_data->td->counts, + CHECK_MEM_ERROR(&cm->error, thread_data->td->counts, vpx_calloc(1, sizeof(*thread_data->td->counts))); // Create threads @@ -265,6 +265,7 @@ static void accumulate_fp_tile_stat(TileDataEnc *tile_data, tile_data->fp_data.intra_count_high += tile_data_t->fp_data.intra_count_high; tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count; tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount; + tile_data->fp_data.new_mv_count += tile_data_t->fp_data.new_mv_count; tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr; tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs; tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc; @@ -292,7 +293,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, { int i; - CHECK_MEM_ERROR(cm, row_mt_sync->mutex, + CHECK_MEM_ERROR(&cm->error, row_mt_sync->mutex, vpx_malloc(sizeof(*row_mt_sync->mutex) * rows)); if (row_mt_sync->mutex) { for (i = 0; i < rows; ++i) { @@ -300,7 +301,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, } } - CHECK_MEM_ERROR(cm, row_mt_sync->cond, + CHECK_MEM_ERROR(&cm->error, row_mt_sync->cond, vpx_malloc(sizeof(*row_mt_sync->cond) * rows)); if (row_mt_sync->cond) { for (i = 0; i < rows; ++i) { @@ -310,7 +311,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, } #endif // CONFIG_MULTITHREAD - CHECK_MEM_ERROR(cm, row_mt_sync->cur_col, + CHECK_MEM_ERROR(&cm->error, row_mt_sync->cur_col, vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows)); // Set up nsync. diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 1d440442b..4664e8c5e 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -8,10 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <stddef.h> + #include "vp9/encoder/vp9_ext_ratectrl.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/common/vp9_common.h" #include "vpx_dsp/psnr.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_tpl.h" vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { if (ext_ratectrl == NULL) { @@ -92,6 +97,7 @@ static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats, rc_frame_stats->mv_in_out_count = stats->mv_in_out_count; rc_frame_stats->duration = stats->duration; rc_frame_stats->count = stats->count; + rc_frame_stats->new_mv_count = stats->new_mv_count; } vpx_codec_err_t vp9_extrc_send_firstpass_stats( @@ -118,6 +124,21 @@ vpx_codec_err_t vp9_extrc_send_firstpass_stats( return VPX_CODEC_OK; } +vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl, + const VpxTplGopStats *tpl_gop_stats) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (ext_ratectrl->ready && ext_ratectrl->funcs.send_tpl_gop_stats != NULL) { + vpx_rc_status_t rc_status = ext_ratectrl->funcs.send_tpl_gop_stats( + ext_ratectrl->model, tpl_gop_stats); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + } + return VPX_CODEC_OK; +} + static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { // TODO(angiebird): Add unit test to make sure this function behaves like // get_frame_type_from_update_type() @@ -131,7 +152,6 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { default: fprintf(stderr, "Unsupported update_type %d\n", update_type); abort(); - return 1; } } diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 7c3875883..b04580c1d 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -12,6 +12,7 @@ #define VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ #include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_tpl.h" #include "vp9/encoder/vp9_firstpass.h" typedef struct EXT_RATECTRL { @@ -34,6 +35,9 @@ vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); vpx_codec_err_t vp9_extrc_send_firstpass_stats( EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); +vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl, + const VpxTplGopStats *tpl_gop_stats); + vpx_codec_err_t vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index e9250e25c..a9cdf5353 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -152,6 +152,7 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->pcnt_intra_high = 0.0; section->inactive_zone_rows = 0.0; section->inactive_zone_cols = 0.0; + section->new_mv_count = 0.0; section->MVr = 0.0; section->mvr_abs = 0.0; section->MVc = 0.0; @@ -183,6 +184,7 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->pcnt_intra_high += frame->pcnt_intra_high; section->inactive_zone_rows += frame->inactive_zone_rows; section->inactive_zone_cols += frame->inactive_zone_cols; + section->new_mv_count += frame->new_mv_count; section->MVr += frame->MVr; section->mvr_abs += frame->mvr_abs; section->MVc += frame->MVc; @@ -212,6 +214,7 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->pcnt_intra_high -= frame->pcnt_intra_high; section->inactive_zone_rows -= frame->inactive_zone_rows; section->inactive_zone_cols -= frame->inactive_zone_cols; + section->new_mv_count -= 
frame->new_mv_count; section->MVr -= frame->MVr; section->mvr_abs -= frame->mvr_abs; section->MVc -= frame->MVc; @@ -361,7 +364,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, case BLOCK_8X16: return vpx_highbd_8_mse8x16; default: return vpx_highbd_8_mse16x16; } - break; case 10: switch (bsize) { case BLOCK_8X8: return vpx_highbd_10_mse8x8; @@ -369,7 +371,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, case BLOCK_8X16: return vpx_highbd_10_mse8x16; default: return vpx_highbd_10_mse16x16; } - break; case 12: switch (bsize) { case BLOCK_8X8: return vpx_highbd_12_mse8x8; @@ -377,7 +378,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, case BLOCK_8X16: return vpx_highbd_12_mse8x16; default: return vpx_highbd_12_mse16x16; } - break; } } @@ -435,6 +435,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize = xd->mi[0]->sb_type; vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; + MV center_mv_full = ref_mv_full; + unsigned int start_mv_sad; + vp9_sad_fn_ptr_t sad_fn_ptr; int step_param = 3; int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; @@ -455,10 +458,18 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } #endif // CONFIG_VP9_HIGHBITDEPTH + // Calculate SAD of the start mv + clamp_mv(&ref_mv_full, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + start_mv_sad = get_start_mv_sad(x, &ref_mv_full, ¢er_mv_full, + cpi->fn_ptr[bsize].sdf, x->sadperbit16); + sad_fn_ptr.sdf = cpi->fn_ptr[bsize].sdf; + sad_fn_ptr.sdx4df = cpi->fn_ptr[bsize].sdx4df; + // Center the initial step/diamond search on best mv. - tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, - step_param, x->sadperbit16, &num00, - &v_fn_ptr, ref_mv); + tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, + &tmp_mv, step_param, x->sadperbit16, &num00, + &sad_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty; @@ -478,9 +489,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (num00) { --num00; } else { - tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, - step_param + n, x->sadperbit16, &num00, - &v_fn_ptr, ref_mv); + tmp_err = cpi->diamond_search_sad( + x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n, + x->sadperbit16, &num00, &sad_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) @@ -595,11 +606,11 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { #define FP_MAX_DN_THRESH 24 #define KERNEL_SIZE 3 -// Baseline Kernal weights for first pass noise metric -static uint8_t fp_dn_kernal_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4, +// Baseline Kernel weights for first pass noise metric +static uint8_t fp_dn_kernel_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; -// Estimate noise at a single point based on the impace of a spatial kernal +// Estimate noise at a single point based on the impact of a spatial kernel // on the point value static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) { int sum_weight = 0; @@ -609,23 +620,23 @@ static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) { int diff; int 
dn_diff; uint8_t *tmp_ptr; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; uint8_t dn_val; uint8_t centre_val = *src_ptr; - kernal_ptr = fp_dn_kernal_3; + kernel_ptr = fp_dn_kernel_3; - // Apply the kernal + // Apply the kernel tmp_ptr = src_ptr - stride - 1; for (i = 0; i < KERNEL_SIZE; ++i) { for (j = 0; j < KERNEL_SIZE; ++j) { diff = abs((int)centre_val - (int)tmp_ptr[j]); max_diff = VPXMAX(max_diff, diff); if (diff <= FP_DN_THRESH) { - sum_weight += *kernal_ptr; - sum_val += (int)tmp_ptr[j] * (int)*kernal_ptr; + sum_weight += *kernel_ptr; + sum_val += (int)tmp_ptr[j] * (int)*kernel_ptr; } - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } @@ -651,13 +662,13 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) { int dn_diff; uint8_t *tmp_ptr; uint16_t *tmp_ptr16; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; uint16_t dn_val; uint16_t centre_val = *CONVERT_TO_SHORTPTR(src_ptr); - kernal_ptr = fp_dn_kernal_3; + kernel_ptr = fp_dn_kernel_3; - // Apply the kernal + // Apply the kernel tmp_ptr = src_ptr - stride - 1; for (i = 0; i < KERNEL_SIZE; ++i) { tmp_ptr16 = CONVERT_TO_SHORTPTR(tmp_ptr); @@ -665,10 +676,10 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) { diff = abs((int)centre_val - (int)tmp_ptr16[j]); max_diff = VPXMAX(max_diff, diff); if (diff <= FP_DN_THRESH) { - sum_weight += *kernal_ptr; - sum_val += (int)tmp_ptr16[j] * (int)*kernal_ptr; + sum_weight += *kernel_ptr; + sum_val += (int)tmp_ptr16[j] * (int)*kernel_ptr; } - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } @@ -793,6 +804,7 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, fps->inactive_zone_cols = (double)0; if (fp_acc_data->mvcount > 0) { + fps->new_mv_count = (double)(fp_acc_data->new_mv_count) / num_mbs; fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount; fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount; fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount; @@ -809,6 +821,7 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, (double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2); fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs; } else { + fps->new_mv_count = 0.0; fps->MVr = 0.0; fps->mvr_abs = 0.0; fps->MVc = 0.0; @@ -834,6 +847,7 @@ static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile, this_tile->fp_data.intra_count_low += fp_acc_data->intra_count_low; this_tile->fp_data.intra_count_high += fp_acc_data->intra_count_high; this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count; + this_tile->fp_data.new_mv_count += fp_acc_data->new_mv_count; this_tile->fp_data.mvcount += fp_acc_data->mvcount; this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr; this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs; @@ -904,6 +918,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, double mb_neutral_count; int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH); + MV *first_top_mv = &tile_data->firstpass_top_mv; + MV last_nonzero_mv = { 0, 0 }; + // First pass code requires valid last and new frame buffers. assert(new_yv12 != NULL); assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); @@ -944,6 +961,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c); + if (mb_col == mb_col_start) { + last_nonzero_mv = *first_top_mv; + } + // Adjust to the next column of MBs. 
x->plane[0].src.buf = cpi->Source->y_buffer + mb_row * 16 * x->plane[0].src.stride + mb_col * 16; @@ -1253,7 +1274,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, xd->mi[0]->mv[0].as_mv = mv; xd->mi[0]->tx_size = TX_4X4; xd->mi[0]->ref_frame[0] = LAST_FRAME; - xd->mi[0]->ref_frame[1] = NONE; + xd->mi[0]->ref_frame[1] = NO_REF_FRAME; vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); vp9_encode_sby_pass1(x, bsize); fp_acc_data->sum_mvr += mv.row; @@ -1268,6 +1289,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, if (!is_zero_mv(&mv)) { ++(fp_acc_data->mvcount); + if (!is_equal_mv(&mv, &last_nonzero_mv)) { + ++(fp_acc_data->new_mv_count); + } + last_nonzero_mv = mv; // Does the row vector point inwards or outwards? if (mb_row < cm->mb_rows / 2) { @@ -1323,6 +1348,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, } fp_acc_data->coded_error += (int64_t)this_error; + if (mb_col == mb_col_start) { + *first_top_mv = last_nonzero_mv; + } recon_yoffset += 16; recon_uvoffset += uv_mb_height; @@ -1345,7 +1373,7 @@ static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) { MV best_ref_mv; // Tiling is ignored in the first pass. vp9_tile_init(tile, cm, 0, 0); - + tile_data.firstpass_top_mv = zero_mv; #if CONFIG_RATE_CTRL if (cpi->oxcf.use_simple_encode_api) { fp_motion_vector_info_reset(cpi->frame_info.frame_width, @@ -1411,7 +1439,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL) CHECK_MEM_ERROR( - cm, cpi->twopass.fp_mb_float_stats, + &cm->error, cpi->twopass.fp_mb_float_stats, vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1)); { @@ -1437,7 +1465,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data)); } - // Dont allow a value of 0 for duration. + // Don't allow a value of 0 for duration. // (Section duration is also defaulted to minimum of 1.0). fps.duration = VPXMAX(1.0, (double)(source->ts_end - source->ts_start)); @@ -1447,7 +1475,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { accumulate_stats(&twopass->total_stats, &fps); } - // Copy the previous Last Frame back into gf and and arf buffers if + // Copy the previous Last Frame back into gf and arf buffers if // the prediction is good enough... but also don't allow it to lag too far. if ((twopass->sr_update_lag > 3) || ((cm->current_video_frame > 0) && @@ -1476,22 +1504,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { cm->ref_frame_map[cpi->lst_fb_idx]); } - // Use this to see what the first pass reconstruction looks like. - if (0) { - char filename[512]; - FILE *recon_file; - snprintf(filename, sizeof(filename), "enc%04d.yuv", - (int)cm->current_video_frame); - - if (cm->current_video_frame == 0) - recon_file = fopen(filename, "wb"); - else - recon_file = fopen(filename, "ab"); - - (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file); - fclose(recon_file); - } - // In the first pass, every frame is considered as a show frame. update_frame_indexes(cm, /*show_frame=*/1); if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); @@ -1664,7 +1676,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // Scan the first pass file and calculate a modified score for each // frame that is used to distribute bits. 
The modified score is assumed - // to provide a linear basis for bit allocation. I.e a frame A with a score + // to provide a linear basis for bit allocation. I.e., a frame A with a score // that is double that of frame B will be allocated 2x as many bits. { double modified_score_total = 0.0; @@ -1689,8 +1701,8 @@ } // Second scan using clamps based on the previous cycle average. - // This may modify the total and average somewhat but we dont bother with - // further itterations. + // This may modify the total and average somewhat but we don't bother with + // further iterations. modified_score_total = 0.0; s = twopass->stats_in; while (s < twopass->stats_in_end) { @@ -1847,7 +1859,7 @@ static int detect_flash_from_frame_stats(const FIRSTPASS_STATS *frame_stats) { // brief break in prediction (such as a flash) but subsequent frames // are reasonably well predicted by an earlier (pre flash) frame. // The recovery after a flash is indicated by a high pcnt_second_ref - // useage or a second ref coded error notabley lower than the last + // usage or a second ref coded error notably lower than the last // frame coded error. if (frame_stats == NULL) { return 0; } @@ -2027,7 +2039,7 @@ static int compute_arf_boost(const FRAME_INFO *frame_info, this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); - // We want to discount the the flash frame itself and the recovery + // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. flash_detected = detect_flash_from_frame_stats(this_frame) || detect_flash_from_frame_stats(next_frame); @@ -2158,7 +2170,7 @@ static double calculate_group_score(VP9_COMP *cpi, double av_score, double score_total = 0.0; int i = 0; - // We dont ever want to return a 0 score here. + // We don't ever want to return a 0 score here. if (frame_count == 0) return 1.0; while ((i < frame_count) && (s < twopass->stats_in_end)) { @@ -2492,7 +2504,7 @@ static int get_gop_coding_frame_num( int *use_alt_ref, const FRAME_INFO *frame_info, const TWO_PASS *const twopass, const RATE_CONTROL *rc, int gf_start_show_idx, const RANGE *active_gf_interval, - double gop_intra_factor, int lag_in_frames) { + double gop_intra_factor, int lag_in_frames, int *end_of_sequence) { const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; double loop_decay_rate = 1.00; double mv_ratio_accumulator = 0.0; @@ -2518,6 +2530,7 @@ static int get_gop_coding_frame_num( next_frame = fps_get_frame_stats(first_pass_info, gf_start_show_idx + gop_coding_frames); if (next_frame == NULL) { + *end_of_sequence = gop_coding_frames == 1 && rc->source_alt_ref_active; break; } @@ -2586,7 +2599,7 @@ static int get_gop_coding_frame_num( if ( // Don't break out with a very short interval. (gop_coding_frames >= active_gf_interval->min) && - // If possible dont break very close to a kf + // If possible don't break very close to a kf ((rc->frames_to_key - gop_coding_frames) >= rc->min_gf_interval) && (gop_coding_frames & 0x01) && (!flash_detected) && ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || @@ -2708,6 +2721,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { double gop_intra_factor; int gop_frames; RANGE active_gf_interval; + // Whether this is at the end of the last GOP of this sequence. + int end_of_sequence = 0; // Reset the GF group data structures unless this is a key // frame in which case it will already have been done.
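The linear-basis property described in the comment above (a frame whose modified score is double another's is allocated twice the bits) is equivalent to splitting the budget in proportion to each frame's share of the total score. The snippet below is an illustrative calculation only, not encoder code; the function name and the fixed budget are invented for the example.

#include <stdio.h>

/* Distribute total_bits across n frames in proportion to their modified
 * scores: bits[i] = total_bits * scores[i] / sum(scores). */
static void allocate_bits(const double *scores, int n, double total_bits,
                          double *bits) {
  double total = 0.0;
  int i;
  for (i = 0; i < n; ++i) total += scores[i];
  for (i = 0; i < n; ++i)
    bits[i] = (total > 0.0) ? total_bits * scores[i] / total : 0.0;
}

int main(void) {
  const double scores[3] = { 1.0, 2.0, 1.0 }; /* frame 1 scores 2x the others */
  double bits[3];
  allocate_bits(scores, 3, 40000.0, bits);
  printf("%.0f %.0f %.0f\n", bits[0], bits[1], bits[2]); /* 10000 20000 10000 */
  return 0;
}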
@@ -2739,7 +2754,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { gop_coding_frames = get_gop_coding_frame_num( &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, - &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames); + &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames, + &end_of_sequence); use_alt_ref &= allow_alt_ref; #if CONFIG_RATE_CTRL // If the external gop_command is on, we will override the decisions @@ -2757,7 +2773,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref| // will be overwritten. if (cpi->ext_ratectrl.ready && - (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && + cpi->ext_ratectrl.funcs.get_gop_decision != NULL && !end_of_sequence) { vpx_codec_err_t codec_status; vpx_rc_gop_decision_t gop_decision; vpx_rc_gop_info_t gop_info; @@ -3020,7 +3037,7 @@ static int intra_step_transition(const FIRSTPASS_STATS *this_frame, next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error); // Return true the intra/inter ratio for the current frame is - // low but better in the next and previous frame and the relative useage of + // low but better in the next and previous frame and the relative usage of // intra in the current frame is markedly higher than the last and next frame. if ((this_ii_ratio < 2.0) && (last_ii_ratio > 2.25) && (next_ii_ratio > 2.25) && (this_pcnt_intra > (3 * last_pcnt_intra)) && @@ -3041,8 +3058,8 @@ static int intra_step_transition(const FIRSTPASS_STATS *this_frame, // Minimum % intra coding observed in first pass (1.0 = 100%) #define MIN_INTRA_LEVEL 0.25 // Threshold for use of the lagging second reference frame. Scene cuts do not -// usually have a high second ref useage. -#define SECOND_REF_USEAGE_THRESH 0.2 +// usually have a high second ref usage. +#define SECOND_REF_USAGE_THRESH 0.2 // Hard threshold where the first pass chooses intra for almost all blocks. // In such a case even if the frame is not a scene cut coding a key frame // may be a good option. @@ -3072,7 +3089,7 @@ static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info, detect_flash_from_frame_stats(next_frame); if (!detect_flash_from_frame_stats(this_frame) && !detect_flash_from_frame_stats(next_frame) && - (this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && + (this_frame->pcnt_second_ref < SECOND_REF_USAGE_THRESH) && ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) || (slide_transition(this_frame, last_frame, next_frame)) || (intra_step_transition(this_frame, last_frame, next_frame)) || @@ -3350,7 +3367,7 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { // The second (lagging) ref error is not valid immediately after // a key frame because either the lag has not built up (in the case of - // the first key frame or it points to a refernce before the new key + // the first key frame or it points to a reference before the new key // frame. if (i < 2) sr_accumulator = 0.0; frame_boost = @@ -3380,7 +3397,7 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { twopass->key_frame_section_intra_rating = calculate_section_intra_ratio( start_position, twopass->stats_in_end, rc->frames_to_key); - // Special case for static / slide show content but dont apply + // Special case for static / slide show content but don't apply // if the kf group is very short. 
if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { rc->kf_boost = (int)(twopass->kf_max_total_boost); @@ -3494,8 +3511,8 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { FIRSTPASS_STATS this_frame; const int show_idx = cm->current_video_frame; - if (cpi->common.current_frame_coding_index == 0) { - VP9_COMMON *cm = &cpi->common; + if (cpi->common.current_frame_coding_index == 0 && + cpi->ext_ratectrl.funcs.send_firstpass_stats != NULL) { const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( &cpi->ext_ratectrl, &cpi->twopass.first_pass_info); if (codec_status != VPX_CODEC_OK) { @@ -3513,7 +3530,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { vp9_init_vizier_params(twopass, screen_area); } - // If this is an arf frame then we dont want to read the stats file or + // If this is an arf frame then we don't want to read the stats file or // advance the input pointer as we already have what we need. if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { int target_rate; @@ -3792,6 +3809,7 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, const int arf_active_or_kf = last_gop_use_alt_ref || first_is_key_frame; RANGE active_gf_interval; int arf_layers; + int end_of_sequence = 0; if (oxcf->use_simple_encode_api) { active_gf_interval = get_active_gf_inverval_range_simple( rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key); @@ -3809,9 +3827,9 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, gop_intra_factor = 1.0; } - frame_count = get_gop_coding_frame_num(use_alt_ref, frame_info, twopass, rc, - show_idx, &active_gf_interval, - gop_intra_factor, oxcf->lag_in_frames); + frame_count = get_gop_coding_frame_num( + use_alt_ref, frame_info, twopass, rc, show_idx, &active_gf_interval, + gop_intra_factor, oxcf->lag_in_frames, &end_of_sequence); *use_alt_ref &= allow_alt_ref; return frame_count; } diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index cdcf56872..a19b04db7 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -14,6 +14,7 @@ #include <assert.h> #include "vp9/common/vp9_onyxc_int.h" +#include "vp9/encoder/vp9_firstpass_stats.h" #include "vp9/encoder/vp9_lookahead.h" #include "vp9/encoder/vp9_ratectrl.h" @@ -55,37 +56,9 @@ typedef struct { int64_t sum_mvcs; int sum_in_vectors; int intra_smooth_count; + int new_mv_count; } FIRSTPASS_DATA; -typedef struct { - double frame; - double weight; - double intra_error; - double coded_error; - double sr_coded_error; - double frame_noise_energy; - double pcnt_inter; - double pcnt_motion; - double pcnt_second_ref; - double pcnt_neutral; - double pcnt_intra_low; // Coded intra but low variance - double pcnt_intra_high; // Coded intra high variance - double intra_skip_pct; - double intra_smooth_pct; // % of blocks that are smooth - double inactive_zone_rows; // Image mask rows top and bottom. - double inactive_zone_cols; // Image mask columns at left and right edges. - double MVr; - double mvr_abs; - double MVc; - double mvc_abs; - double MVrv; - double MVcv; - double mv_in_out_count; - double duration; - double count; - int64_t spatial_layer_id; -} FIRSTPASS_STATS; - typedef enum { KF_UPDATE = 0, LF_UPDATE = 1, diff --git a/vp9/encoder/vp9_firstpass_stats.h b/vp9/encoder/vp9_firstpass_stats.h new file mode 100644 index 000000000..01928e781 --- /dev/null +++ b/vp9/encoder/vp9_firstpass_stats.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ +#define VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + double frame; + double weight; + double intra_error; + double coded_error; + double sr_coded_error; + double frame_noise_energy; + double pcnt_inter; + double pcnt_motion; + double pcnt_second_ref; + double pcnt_neutral; + double pcnt_intra_low; // Coded intra but low variance + double pcnt_intra_high; // Coded intra high variance + double intra_skip_pct; + double intra_smooth_pct; // % of blocks that are smooth + double inactive_zone_rows; // Image mask rows top and bottom. + double inactive_zone_cols; // Image mask columns at left and right edges. + double MVr; + double mvr_abs; + double MVc; + double mvc_abs; + double MVrv; + double MVcv; + double mv_in_out_count; + double duration; + double count; + double new_mv_count; + int64_t spatial_layer_id; +} FIRSTPASS_STATS; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c index a410d0407..ba550a1d6 100644 --- a/vp9/encoder/vp9_frame_scale.c +++ b/vp9/encoder/vp9_frame_scale.c @@ -12,6 +12,7 @@ #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vp9/common/vp9_blockd.h" +#include "vp9/encoder/vp9_encoder.h" #include "vpx_dsp/vpx_filter.h" #include "vpx_scale/yv12config.h" @@ -91,6 +92,23 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, { const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; + + // The issue b/311394513 reveals a corner case bug. vpx_scaled_2d() requires + // both x_step_q4 and y_step_q4 are less than or equal to 64. Otherwise, it + // needs to call vp9_scale_and_extend_frame_nonnormative() that supports + // arbitrary scaling. + const int x_step_q4 = 16 * src_w / dst_w; + const int y_step_q4 = 16 * src_h / dst_h; + if (x_step_q4 > 64 || y_step_q4 > 64) { + // This function is only called while cm->bit_depth is VPX_BITS_8. +#if CONFIG_VP9_HIGHBITDEPTH + vp9_scale_and_extend_frame_nonnormative(src, dst, (int)VPX_BITS_8); +#else + vp9_scale_and_extend_frame_nonnormative(src, dst); +#endif // CONFIG_VP9_HIGHBITDEPTH + return; + } + for (i = 0; i < MAX_MB_PLANE; ++i) { const int factor = (i == 0 || i == 3 ? 1 : 2); const int src_stride = src_strides[i]; diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 7c2790cb9..2f20a8fe6 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -98,8 +98,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv, // If the current best reference mv is not centered on 0,0 then do a 0,0 // based search as well. 
if (ref_mv->row != 0 || ref_mv->col != 0) { - unsigned int tmp_err; - MV zero_ref_mv = { 0, 0 }, tmp_mv; + MV zero_ref_mv = { 0, 0 }; tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv, mb_row, mb_col); @@ -238,7 +237,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, xd->mi[0] = &mi_local; mi_local.sb_type = BLOCK_16X16; mi_local.ref_frame[0] = LAST_FRAME; - mi_local.ref_frame[1] = NONE; + mi_local.ref_frame[1] = NO_REF_FRAME; for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { MV gld_left_mv = gld_top_mv; @@ -289,7 +288,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) { int *arf_not_zz; CHECK_MEM_ERROR( - cm, arf_not_zz, + &cm->error, arf_not_zz, vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1)); // We are not interested in results beyond the alt ref itself. @@ -334,23 +333,16 @@ static void separate_arf_mbs(VP9_COMP *cpi) { } } - // Only bother with segmentation if over 10% of the MBs in static segment - // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) ) - if (1) { - // Note % of blocks that are marked as static - if (cm->MBs) - cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols); + // Note % of blocks that are marked as static + if (cm->MBs) + cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols); - // This error case should not be reachable as this function should - // never be called with the common data structure uninitialized. - else - cpi->static_mb_pct = 0; - - vp9_enable_segmentation(&cm->seg); - } else { + // This error case should not be reachable as this function should + // never be called with the common data structure uninitialized. + else cpi->static_mb_pct = 0; - vp9_disable_segmentation(&cm->seg); - } + + vp9_enable_segmentation(&cm->seg); // Free locally allocated storage vpx_free(arf_not_zz); diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 1f08aa5de..cbe1c4029 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -77,14 +77,6 @@ int vp9_init_search_range(int size) { return sr; } -static INLINE int mv_cost(const MV *mv, const int *joint_cost, - int *const comp_cost[2]) { - assert(mv->row >= -MV_MAX && mv->row < MV_MAX); - assert(mv->col >= -MV_MAX && mv->col < MV_MAX); - return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] + - comp_cost[1][mv->col]; -} - int vp9_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost, int *mvcost[2], int weight) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; @@ -103,15 +95,6 @@ static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost, } return 0; } - -static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref, - int sad_per_bit) { - const MV diff = { mv->row - ref->row, mv->col - ref->col }; - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) { int len; int ss_count = 0; @@ -163,8 +146,8 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ int64_t tmpmse; \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ if (second_pred == NULL) { \ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse); \ } else { \ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse, second_pred); \ }
\ tmpmse = thismse; \ - tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + tmpmse += \ + mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \ if (tmpmse >= INT_MAX) { \ v = INT_MAX; \ } else if ((v = (uint32_t)tmpmse) < besterr) { \ @@ -192,15 +176,16 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { #define CHECK_BETTER(v, r, c) \ do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ if (second_pred == NULL) \ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse); \ else \ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse, second_pred); \ - if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \ + error_per_bit) + \ thismse) < besterr) { \ besterr = v; \ br = r; \ @@ -312,7 +297,7 @@ static unsigned int setup_center_error( besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } @@ -327,7 +312,7 @@ static unsigned int setup_center_error( uint32_t besterr; (void)xd; if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } else { @@ -650,7 +635,7 @@ static int accurate_sub_pel_search( vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, 0, kernel, MV_PRECISION_Q3, 0, 0); if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); } else { @@ -669,7 +654,7 @@ static int accurate_sub_pel_search( vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, 0, kernel, MV_PRECISION_Q3, 0, 0); if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); } else { @@ -686,13 +671,14 @@ static int accurate_sub_pel_search( do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ int64_t tmpmse; \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ - thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \ src_stride, y, y_stride, second_pred, \ w, h, &sse); \ tmpmse = thismse; \ - tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + tmpmse += \ + mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \ if (tmpmse >= INT_MAX) { \ v = INT_MAX; \ } else if ((v = (uint32_t)tmpmse) < besterr) { \ @@ -711,12 +697,13 @@ static int accurate_sub_pel_search( #define CHECK_BETTER1(v, r, c) \ do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - const MV mv = { r, c }; 
\ - const MV ref_mv = { rr, rc }; \ - thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \ src_stride, y, y_stride, second_pred, \ w, h, &sse); \ - if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \ + error_per_bit) + \ thismse) < besterr) { \ besterr = v; \ br = r; \ @@ -966,7 +953,7 @@ static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) { } #define MAX_PATTERN_SCALES 11 -#define MAX_PATTERN_CANDIDATES 8 // max number of canddiates per scale +#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale #define PATTERN_CANDIDATES_REF 3 // number of refinement candidates // Calculate and return a sad+mvcost list around an integer best pel. @@ -980,16 +967,14 @@ static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv, const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 }; int br = best_mv->row; int bc = best_mv->col; - MV this_mv; + const MV mv = { br, bc }; int i; unsigned int sse; - this_mv.row = br; - this_mv.col = bc; cost_list[0] = - fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), + fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), in_what->stride, &sse) + - mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb); + mvsad_err_cost(x, &mv, &fcenter_mv, sadpb); if (check_bounds(&x->mv_limits, br, bc, 1)) { for (i = 0; i < 4; i++) { const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col }; @@ -1049,7 +1034,7 @@ static int vp9_pattern_search( in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - // Search all possible scales upto the search param around the center point + // Search all possible scales up to the search param around the center point // pick the scale of the point that is best as the starting scale of // further steps around it. if (do_init_search) { @@ -1170,6 +1155,9 @@ static int vp9_pattern_search( } while (s--); } + best_mv->row = br; + best_mv->col = bc; + // Returns the one-away integer pel sad values around the best as follows: // cost_list[0]: cost at the best integer pel // cost_list[1]: cost at delta {0, -1} (left) from the best integer pel @@ -1177,11 +1165,8 @@ static int vp9_pattern_search( // cost_list[3]: cost at delta { 0, 1} (right) from the best integer pel // cost_list[4]: cost at delta {-1, 0} (top) from the best integer pel if (cost_list) { - const MV best_mv = { br, bc }; - calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_mv, cost_list); + calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, best_mv, cost_list); } - best_mv->row = br; - best_mv->col = bc; return bestsad; } @@ -1223,7 +1208,7 @@ static int vp9_pattern_search_sad( in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - // Search all possible scales upto the search param around the center point + // Search all possible scales up to the search param around the center point // pick the scale of the point that is best as the starting scale of // further steps around it. 
if (do_init_search) { @@ -2068,9 +2053,9 @@ int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row, #endif // CONFIG_NON_GREEDY_MV int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, - MV *ref_mv, MV *best_mv, int search_param, - int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { int i, j, step; @@ -2081,7 +2066,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, const int in_what_stride = xd->plane[0].pre[0].stride; const uint8_t *best_address; - unsigned int bestsad = INT_MAX; + unsigned int bestsad = start_mv_sad; int best_site = -1; int last_site = -1; @@ -2099,8 +2084,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, const int tot_steps = cfg->total_steps - search_param; const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, - x->mv_limits.row_min, x->mv_limits.row_max); ref_row = ref_mv->row; ref_col = ref_mv->col; *num00 = 0; @@ -2111,10 +2094,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; best_address = in_what; - // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + - mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit); - i = 0; for (step = 0; step < tot_steps; step++) { @@ -2138,8 +2117,8 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address; - fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, - sad_array); + sad_fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); for (t = 0; t < 4; t++, i++) { if (sad_array[t] < bestsad) { @@ -2163,7 +2142,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, if (is_mv_in(&x->mv_limits, &this_mv)) { const uint8_t *const check_here = ss_os[i] + best_address; unsigned int thissad = - fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + sad_fn_ptr->sdf(what, what_stride, check_here, in_what_stride); if (thissad < bestsad) { thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); @@ -2321,17 +2300,16 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, // TODO(jingning): Implement integral projection functions for high bit-depth // setting and remove this part of code. 
if (xd->bd != 8) { - unsigned int this_sad; + const unsigned int sad = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, src_stride, xd->plane[0].pre[0].buf, ref_stride); tmp_mv->row = 0; tmp_mv->col = 0; - this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, - xd->plane[0].pre[0].buf, ref_stride); if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; } - return this_sad; + return sad; } #endif @@ -2506,15 +2484,54 @@ int vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, point as the best match, we will do a final 1-away diamond refining search */ static int full_pixel_diamond(const VP9_COMP *const cpi, - const MACROBLOCK *const x, MV *mvp_full, - int step_param, int sadpb, int further_steps, - int do_refine, int *cost_list, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + int use_downsampled_sad, int *cost_list, const vp9_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, MV *dst_mv) { MV temp_mv; int thissme, n, num00 = 0; - int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, - step_param, sadpb, &n, fn_ptr, ref_mv); + int bestsme; + const int src_buf_stride = x->plane[0].src.stride; + const uint8_t *const src_buf = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pred_buf_stride = xd->plane[0].pre[0].stride; + uint8_t *pred_buf; + vp9_sad_fn_ptr_t sad_fn_ptr; + unsigned int start_mv_sad, start_mv_sad_even_rows, start_mv_sad_odd_rows; + const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 }; + clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + + pred_buf = + xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col; + start_mv_sad_even_rows = + fn_ptr->sdsf(src_buf, src_buf_stride, pred_buf, pred_buf_stride); + start_mv_sad_odd_rows = + fn_ptr->sdsf(src_buf + src_buf_stride, src_buf_stride, + pred_buf + pred_buf_stride, pred_buf_stride); + start_mv_sad = (start_mv_sad_even_rows + start_mv_sad_odd_rows) >> 1; + start_mv_sad += mvsad_err_cost(x, mvp_full, &ref_mv_full, sadpb); + + sad_fn_ptr.sdf = fn_ptr->sdf; + sad_fn_ptr.sdx4df = fn_ptr->sdx4df; + if (use_downsampled_sad && num_4x4_blocks_high_lookup[bsize] >= 2) { + // If the absolute difference between the pred-to-src SAD of even rows and + // the pred-to-src SAD of odd rows is small, skip every other row in sad + // computation. 
+ const int odd_to_even_diff_sad = + abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows); + const int mult_thresh = 10; + if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) { + sad_fn_ptr.sdf = fn_ptr->sdsf; + sad_fn_ptr.sdx4df = fn_ptr->sdsx4df; + } + } + + bestsme = + cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv, + step_param, sadpb, &n, &sad_fn_ptr, ref_mv); if (bestsme < INT_MAX) bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); *dst_mv = temp_mv; @@ -2529,9 +2546,9 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, if (num00) { num00--; } else { - thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, - step_param + n, sadpb, &num00, fn_ptr, - ref_mv); + thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, + &temp_mv, step_param + n, sadpb, &num00, + &sad_fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); @@ -2549,8 +2566,8 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, if (do_refine) { const int search_range = 8; MV best_mv = *dst_mv; - thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr, - ref_mv); + thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, + &sad_fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1); if (thissme < bestsme) { @@ -2559,6 +2576,27 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, } } + if (sad_fn_ptr.sdf != fn_ptr->sdf) { + // If we are skipping rows when we perform the motion search, we need to + // check the quality of skipping. If it's bad, then we run search with + // skip row features off. + const uint8_t *best_address = get_buf_from_mv(&xd->plane[0].pre[0], dst_mv); + const int sad = + fn_ptr->sdf(src_buf, src_buf_stride, best_address, pred_buf_stride); + const int skip_sad = + fn_ptr->sdsf(src_buf, src_buf_stride, best_address, pred_buf_stride); + // We will keep the result of skipping rows if it's good enough. + const int kSADThresh = + 1 << (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= VPXMAX(sad, 1) * 9) { + // There is a large discrepancy between skipping and not skipping, so we + // need to redo the motion search. + return full_pixel_diamond(cpi, x, bsize, mvp_full, step_param, sadpb, + further_steps, do_refine, 0, cost_list, fn_ptr, + ref_mv, dst_mv); + } + } + // Return cost list. 
if (cost_list) { calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list); @@ -2710,7 +2748,7 @@ int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv, int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; @@ -2719,7 +2757,7 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv); unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + + sad_fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); int i, j; @@ -2736,7 +2774,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, best_address - 1, best_address + 1, best_address + in_what->stride }; - fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads); + sad_fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, + sads); for (j = 0; j < 4; ++j) { if (sads[j] < best_sad) { @@ -2756,8 +2795,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, if (is_mv_in(&x->mv_limits, &mv)) { unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride); + sad_fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); if (sad < best_sad) { sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); if (sad < best_sad) { @@ -2874,9 +2913,10 @@ int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x, break; case NSTEP: case MESH: - var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, - MAX_MVSEARCH_STEPS - 1 - step_param, 1, - cost_list, fn_ptr, ref_mv, tmp_mv); + var = full_pixel_diamond( + cpi, x, bsize, mvp_full, step_param, error_per_bit, + MAX_MVSEARCH_STEPS - 1 - step_param, 1, + cpi->sf.mv.use_downsampled_sad, cost_list, fn_ptr, ref_mv, tmp_mv); break; default: assert(0 && "Unknown search method"); } diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index bdaf2ce77..fd6a8b9ac 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -41,6 +41,11 @@ typedef struct search_site_config { int total_steps; } search_site_config; +typedef struct vp9_sad_table { + vpx_sad_fn_t sdf; + vpx_sad_multi_d_fn_t sdx4df; +} vp9_sad_fn_ptr_t; + static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, const MV *mv) { return &buf->buf[mv->row * buf->stride + mv->col]; @@ -63,12 +68,13 @@ int vp9_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv, struct VP9_COMP; struct SPEED_FEATURES; +struct vp9_sad_table; int vp9_init_search_range(int size); int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv, int error_per_bit, int search_range, - const struct vp9_variance_vtable *fn_ptr, + const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); // Perform integral projection based motion estimation. 
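The downsampled-SAD path added to full_pixel_diamond above rests on two tests: before the search, the even-row and odd-row SADs at the starting MV must agree closely enough to justify sampling every other row; after the search, the skipped-row SAD at the winning MV must not diverge too far from the full SAD, or the search is redone with skipping disabled. Below is a condensed sketch of the two predicates, reusing the constants from the diff; the helper names themselves are illustrative, not part of the encoder.

#include <stdlib.h>

#define VPXMAX(a, b) (((a) > (b)) ? (a) : (b))

/* Enable row skipping only when the even-row and odd-row SADs at the start
 * MV differ by less than one tenth of the even-row SAD. */
static int can_use_downsampled_sad(unsigned int sad_even_rows,
                                   unsigned int sad_odd_rows) {
  const int odd_to_even_diff_sad =
      abs((int)sad_even_rows - (int)sad_odd_rows);
  const int mult_thresh = 10;
  return odd_to_even_diff_sad * mult_thresh < (int)sad_even_rows;
}

/* After the search, redo it without skipping if the skipped SAD at the best
 * MV differs from the full SAD by 90% or more of the full value. */
static int must_redo_full_search(int sad, int skip_sad, int sad_thresh) {
  return sad > sad_thresh && abs(skip_sad - sad) * 10 >= VPXMAX(sad, 1) * 9;
}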
@@ -94,9 +100,9 @@ extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv; extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv; typedef int (*vp9_diamond_search_fn_t)( - const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, - int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv); + const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, + uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, + int *num00, const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv); int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c index 45659f2a9..0843cd97e 100644 --- a/vp9/encoder/vp9_multi_thread.c +++ b/vp9/encoder/vp9_multi_thread.c @@ -59,7 +59,7 @@ void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, int i; CHECK_MEM_ERROR( - cm, this_tile->row_base_thresh_freq_fact, + &cm->error, this_tile->row_base_thresh_freq_fact, (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, sizeof(*(this_tile->row_base_thresh_freq_fact)))); for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) @@ -85,7 +85,7 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { multi_thread_ctxt->allocated_tile_rows = tile_rows; multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col; - CHECK_MEM_ERROR(cm, multi_thread_ctxt->job_queue, + CHECK_MEM_ERROR(&cm->error, multi_thread_ctxt->job_queue, (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue))); #if CONFIG_MULTITHREAD diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c index 9696529c5..4ee6e51ba 100644 --- a/vp9/encoder/vp9_noise_estimate.c +++ b/vp9/encoder/vp9_noise_estimate.c @@ -202,7 +202,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { VPXMIN(cpi->consec_zero_mv[bl_index1], VPXMIN(cpi->consec_zero_mv[bl_index2], cpi->consec_zero_mv[bl_index3]))); - // Only consider blocks that are likely steady background. i.e, have + // Only consider blocks that are likely steady background. i.e., have // been encoded as zero/low motion x (= thresh_consec_zeromv) frames // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all // 4 sub-blocks for 16x16 block. And exclude this frame if diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 579b466ca..6f2524b36 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -566,23 +566,26 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, // Transform skipping test in UV planes. 
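The UV transform-skip test that follows compares per-plane variance and SSE against thresholds derived from the squared dequantizer steps. A condensed sketch of the predicate, with plain scalars standing in for the plane structs (the helper name is ours):

```c
#include <stdint.h>

/* Sketch of the test below: AC energy (variance) must fall under the squared
 * AC dequant step, and DC energy (sse - var) under the squared DC dequant
 * step, both scaled by the block/transform size ratio sf. */
static int uv_txfm_skippable(uint32_t var, uint32_t sse, uint16_t dequant_dc,
                             uint16_t dequant_ac, int sf) {
  const uint32_t dc_thr = ((uint32_t)dequant_dc * dequant_dc) >> (6 - sf);
  const uint32_t ac_thr = ((uint32_t)dequant_ac * dequant_ac) >> (6 - sf);
  return (var < ac_thr || var == 0) && (sse - var < dc_thr || sse == var);
}
```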
for (i = 1; i <= 2; i++) { - struct macroblock_plane *const p = &x->plane[i]; - struct macroblockd_plane *const pd = &xd->plane[i]; - const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd); + struct macroblock_plane *const p_uv = &x->plane[i]; + struct macroblockd_plane *const pd_uv = &xd->plane[i]; + const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd_uv); const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; - const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd); + const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd_uv); const int uv_bw = b_width_log2_lookup[uv_bsize]; const int uv_bh = b_height_log2_lookup[uv_bsize]; const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + (uv_bh - b_height_log2_lookup[unit_size]); - const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); - const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); + const uint32_t uv_dc_thr = + pd_uv->dequant[0] * pd_uv->dequant[0] >> (6 - sf); + const uint32_t uv_ac_thr = + pd_uv->dequant[1] * pd_uv->dequant[1] >> (6 - sf); int j = i - 1; vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); flag_preduv_computed[i - 1] = 1; - var_uv[j] = cpi->fn_ptr[uv_bsize].vf( - p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); + var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p_uv->src.buf, p_uv->src.stride, + pd_uv->dst.buf, pd_uv->dst.stride, + &sse_uv[j]); if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) @@ -768,7 +771,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, for (r = 0; r < max_blocks_high; r += block_step) { for (c = 0; c < num_4x4_w; c += block_step) { if (c < max_blocks_wide) { - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); @@ -783,22 +786,19 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, switch (tx_size) { case TX_16X16: vpx_hadamard_16x16(src_diff, diff_stride, coeff); - vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_hadamard_8x8(src_diff, diff_stride, coeff); - vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } *skippable &= (*eob == 0); @@ -1395,8 +1395,8 @@ static void recheck_zeromv_after_denoising( RD_COST this_rdc; mi->mode = ZEROMV; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; - set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE); + mi->ref_frame[1] = NO_REF_FRAME; + set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME); mi->mv[0].as_int = 0; mi->interp_filter = EIGHTTAP; if (cpi->sf.default_interp_filter == BILINEAR) mi->interp_filter = BILINEAR; @@ -1414,7 +1414,7 @@ static void 
recheck_zeromv_after_denoising( this_rdc = *best_rdc; mi->mode = ctx_den->best_mode; mi->ref_frame[0] = ctx_den->best_ref_frame; - set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE); + set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME); mi->interp_filter = ctx_den->best_pred_filter; if (ctx_den->best_ref_frame == INTRA_FRAME) { mi->mv[0].as_int = INVALID_MV; @@ -1678,7 +1678,7 @@ static INLINE void init_best_pickmode(BEST_PICKMODE *bp) { bp->best_intra_tx_size = TX_SIZES; bp->best_pred_filter = EIGHTTAP; bp->best_mode_skip_txfm = SKIP_TXFM_NONE; - bp->best_second_ref_frame = NONE; + bp->best_second_ref_frame = NO_REF_FRAME; bp->best_pred = NULL; } @@ -1872,8 +1872,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vp9_rd_cost_reset(&best_rdc); vp9_rd_cost_reset(rd_cost); mi->sb_type = bsize; - mi->ref_frame[0] = NONE; - mi->ref_frame[1] = NONE; + mi->ref_frame[0] = NO_REF_FRAME; + mi->ref_frame[1] = NO_REF_FRAME; mi->tx_size = VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cm->tx_mode]); @@ -1933,15 +1933,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->use_svc && svc->force_zero_mode_spatial_ref && svc->spatial_layer_id > 0 && !gf_temporal_ref) { if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; - if (vp9_is_scaled(sf)) { + struct scale_factors *const ref_sf = &cm->frame_refs[LAST_FRAME - 1].sf; + if (vp9_is_scaled(ref_sf)) { svc_force_zero_mode[LAST_FRAME - 1] = 1; inter_layer_ref = LAST_FRAME; } } if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; - if (vp9_is_scaled(sf)) { + struct scale_factors *const ref_sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; + if (vp9_is_scaled(ref_sf)) { svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; inter_layer_ref = GOLDEN_FRAME; } @@ -2051,7 +2051,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int comp_pred = 0; int force_mv_inter_layer = 0; PREDICTION_MODE this_mode; - second_ref_frame = NONE; + second_ref_frame = NO_REF_FRAME; if (idx < num_inter_modes) { this_mode = ref_mode_set[idx].pred_mode; @@ -2628,7 +2628,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, best_pickmode.best_mode = this_mode; best_pickmode.best_intra_tx_size = mi->tx_size; best_pickmode.best_ref_frame = INTRA_FRAME; - best_pickmode.best_second_ref_frame = NONE; + best_pickmode.best_second_ref_frame = NO_REF_FRAME; mi->uv_mode = this_mode; mi->mv[0].as_int = INVALID_MV; mi->mv[1].as_int = INVALID_MV; @@ -2750,8 +2750,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, MODE_INFO *const mi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const struct segmentation *const seg = &cm->seg; - MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE; - MV_REFERENCE_FRAME best_ref_frame = NONE; + MV_REFERENCE_FRAME ref_frame, second_ref_frame = NO_REF_FRAME; + MV_REFERENCE_FRAME best_ref_frame = NO_REF_FRAME; unsigned char segment_id = mi->segment_id; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; int64_t best_rd = INT64_MAX; @@ -2772,9 +2772,10 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) { int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame]; - const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; - vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, 
mi_row, mi_col, sf, - sf); + const struct scale_factors *const ref_sf = + &cm->frame_refs[ref_frame - 1].sf; + vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, ref_sf, + ref_sf); vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, mbmi_ext->mode_context); @@ -2789,7 +2790,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, mi->tx_size = TX_4X4; mi->uv_mode = DC_PRED; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter; diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index dcc44449f..19edf166d 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -15,6 +15,7 @@ #include "vpx_ports/mem.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_encoder.h" @@ -22,12 +23,14 @@ #include "vp9/encoder/vp9_rd.h" void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, eob = -1; - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -53,15 +56,15 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i; int eob = -1; - - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -86,12 +89,14 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // TODO(jingning) Refactor this file and combine functions with similar // operations. 
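These signature changes move `round_fp`/`quant_fp` and the scan table behind the `macroblock_plane` and `ScanOrder` arguments without touching the arithmetic. For orientation, a simplified standalone sketch of this style of fast-path quantizer; the names are ours, and the 16-bit clamping the real code performs is omitted:

```c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Visit coefficients in scan order; DC (position 0) uses entry 0 of the
 * per-band tables, all AC positions use entry 1. eob is one past the last
 * scan index that quantized to a nonzero value. */
static void quantize_fp_sketch(const int32_t *coeff, int n,
                               const int16_t *round, const int16_t *quant,
                               const int16_t *dequant, const int16_t *scan,
                               int32_t *qcoeff, int32_t *dqcoeff,
                               uint16_t *eob) {
  int i, last = -1;
  memset(qcoeff, 0, n * sizeof(*qcoeff));
  memset(dqcoeff, 0, n * sizeof(*dqcoeff));
  for (i = 0; i < n; ++i) {
    const int rc = scan[i];
    const int band = rc != 0;
    const int32_t c = coeff[rc];
    const int32_t abs_q =
        (int32_t)((((int64_t)abs(c) + round[band]) * quant[band]) >> 16);
    qcoeff[rc] = c < 0 ? -abs_q : abs_q;
    dqcoeff[rc] = qcoeff[rc] * dequant[band];
    if (abs_q) last = i;
  }
  *eob = (uint16_t)(last + 1);
}
```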
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, eob = -1; - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -118,13 +123,14 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_quantize_fp_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { int i, eob = -1; - - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -249,8 +255,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { // Y x->plane[0].quant = quants->y_quant[qindex]; x->plane[0].quant_fp = quants->y_quant_fp[qindex]; - memcpy(x->plane[0].round_fp, quants->y_round_fp[qindex], - 8 * sizeof(*(x->plane[0].round_fp))); + x->plane[0].round_fp = quants->y_round_fp[qindex]; x->plane[0].quant_shift = quants->y_quant_shift[qindex]; x->plane[0].zbin = quants->y_zbin[qindex]; x->plane[0].round = quants->y_round[qindex]; @@ -262,8 +267,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { for (i = 1; i < 3; i++) { x->plane[i].quant = quants->uv_quant[qindex]; x->plane[i].quant_fp = quants->uv_quant_fp[qindex]; - memcpy(x->plane[i].round_fp, quants->uv_round_fp[qindex], - 8 * sizeof(*(x->plane[i].round_fp))); + x->plane[i].round_fp = quants->uv_round_fp[qindex]; x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; x->plane[i].zbin = quants->uv_zbin[qindex]; x->plane[i].round = quants->uv_round[qindex]; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index d9207f7a2..6452e349d 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -260,7 +260,7 @@ void vp9_update_buffer_level_preencode(VP9_COMP *cpi) { // for the layered rate control which involves cumulative buffer levels for // the temporal layers. Allow for using the timestamp(pts) delta for the // framerate when the set_ref_frame_config is used. 
-static void update_buffer_level_svc_preencode(VP9_COMP *cpi) { +void vp9_update_buffer_level_svc_preencode(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; int i; // Set this to 1 to use timestamp delta for "framerate" under @@ -680,7 +680,8 @@ static int adjust_q_cbr(const VP9_COMP *cpi, int q) { else q = qclamp; } - if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_limit_q(cpi, &q); return VPXMAX(VPXMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality); } @@ -1150,8 +1151,9 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, if (frame_is_intra_only(cm)) { if (oxcf->rc_mode == VPX_Q) { int qindex = cq_level; - double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); - int delta_qindex = vp9_compute_qdelta(rc, q, q * 0.25, cm->bit_depth); + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + int delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.25, cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else if (rc->this_key_frame_forced) { // Handle the special case for key frames forced when we have reached @@ -1195,7 +1197,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, } else { q = rc->avg_frame_qindex[KEY_FRAME]; } - // For constrained quality dont allow Q less than the cq level + // For constrained quality don't allow Q less than the cq level if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; @@ -1206,12 +1208,14 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, } else if (oxcf->rc_mode == VPX_Q) { int qindex = cq_level; - double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); int delta_qindex; if (cpi->refresh_alt_ref_frame) - delta_qindex = vp9_compute_qdelta(rc, q, q * 0.40, cm->bit_depth); + delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.40, cm->bit_depth); else - delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth); + delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.50, cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else { active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); @@ -1219,11 +1223,12 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, } else { if (oxcf->rc_mode == VPX_Q) { int qindex = cq_level; - double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0, 0.70, 1.0, 0.85, 1.0 }; int delta_qindex = vp9_compute_qdelta( - rc, q, q * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL], + rc, qstart, + qstart * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL], cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else { @@ -1355,7 +1360,7 @@ static void pick_kf_q_bound_two_pass(const VP9_COMP *cpi, int *bottom_index, active_best_quality /= 4; } - // Dont allow the active min to be lossless (q0) unlesss the max q + // Don't allow the active min to be lossless (q0) unless the max q // already indicates lossless.
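The `q` to `qstart` renames above avoid shadowing the enclosing `q` but keep the VPX_Q logic: scale the configured quantizer (0.25 for key frames, 0.40/0.50 for alt-ref and other frames) and convert the result back into a qindex delta. A rough sketch of that inverse mapping, assuming a monotonically increasing `q_table[]` as a stand-in for `vp9_convert_qindex_to_q()`; the real `vp9_compute_qdelta()` may differ in details:

```c
/* Find the qindex whose quantizer step best matches qstart * factor and
 * return its distance from the starting index (negative deltas raise
 * quality). */
static int compute_qdelta_sketch(const double *q_table, int num_qindex,
                                 int start_qindex, double factor) {
  const double target_q = q_table[start_qindex] * factor;
  int i, target_index = num_qindex - 1;
  for (i = 0; i < num_qindex; ++i) {
    if (q_table[i] >= target_q) {
      target_index = i;
      break;
    }
  }
  return target_index - start_qindex;
}
```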
active_best_quality = VPXMIN(active_worst_quality, VPXMAX(1, active_best_quality)); @@ -1453,7 +1458,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, } else { q = active_worst_quality; } - // For constrained quality dont allow Q less than the cq level + // For constrained quality don't allow Q less than the cq level if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; } @@ -1859,8 +1864,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); if (cpi->use_svc) { - int i = 0; - SVC *svc = &cpi->svc; + int i; for (i = 0; i < svc->number_temporal_layers; ++i) { const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); @@ -1988,6 +1992,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth; if (cpi->use_svc && svc->spatial_layer_id < svc->number_spatial_layers - 1) svc->lower_layer_qindex = cm->base_qindex; + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; } void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { @@ -2008,6 +2013,7 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { cpi->rc.buffer_level = cpi->rc.optimal_buffer_level; cpi->rc.bits_off_target = cpi->rc.optimal_buffer_level; } + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; } int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { @@ -2033,7 +2039,11 @@ int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { static const int kf_ratio = 25; const RATE_CONTROL *rc = &cpi->rc; - const int target = rc->avg_frame_bandwidth * kf_ratio; + int target = rc->avg_frame_bandwidth; + if (target > INT_MAX / kf_ratio) + target = INT_MAX; + else + target = rc->avg_frame_bandwidth * kf_ratio; return vp9_rc_clamp_iframe_target_size(cpi, target); } @@ -2111,7 +2121,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { int target; if (!cpi->refresh_alt_ref_frame && (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || - rc->frames_to_key == 0)) { + rc->frames_to_key == 0 || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame))) { cm->frame_type = KEY_FRAME; rc->this_key_frame_forced = cm->current_video_frame != 0 && rc->frames_to_key == 0; @@ -2165,12 +2176,12 @@ int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { if (diff > 0) { // Lower the target bandwidth for this frame. const int pct_low = (int)VPXMIN(diff / one_pct_bits, oxcf->under_shoot_pct); - target -= (target * pct_low) / 200; + target -= (int)(((int64_t)target * pct_low) / 200); } else if (diff < 0) { // Increase the target bandwidth for this frame. const int pct_high = (int)VPXMIN(-diff / one_pct_bits, oxcf->over_shoot_pct); - target += (target * pct_high) / 200; + target += (int)(((int64_t)target * pct_high) / 200); } if (oxcf->rc_max_inter_bitrate_pct) { const int max_rate = @@ -2277,14 +2288,15 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { // Periodic key frames is based on the super-frame counter // (svc.current_superframe), also only base spatial layer is key frame. // Key frame is set for any of the following: very first frame, frame flags - // indicates key, superframe counter hits key frequency, or (non-intra) sync - // flag is set for spatial layer 0. 
+ // indicates key, superframe counter hits key frequency, (non-intra) sync + // flag is set for spatial layer 0, or deadline mode changes. if ((cm->current_video_frame == 0 && !svc->previous_frame_is_intra_only) || (cpi->frame_flags & FRAMEFLAGS_KEY) || (cpi->oxcf.auto_key && (svc->current_superframe % cpi->oxcf.key_freq == 0) && !svc->previous_frame_is_intra_only && svc->spatial_layer_id == 0) || - (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0)) { + (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0) || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) { cm->frame_type = KEY_FRAME; rc->source_alt_ref_active = 0; if (is_one_pass_svc(cpi)) { @@ -2438,7 +2450,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { vp9_cyclic_refresh_update_parameters(cpi); vp9_rc_set_frame_target(cpi, target); - if (cm->show_frame) update_buffer_level_svc_preencode(cpi); + if (cm->show_frame) vp9_update_buffer_level_svc_preencode(cpi); if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 && svc->spatial_layer_id == svc->first_spatial_layer_to_encode && @@ -2483,7 +2495,8 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; int target; if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) || - (cpi->oxcf.auto_key && rc->frames_to_key == 0)) { + (cpi->oxcf.auto_key && rc->frames_to_key == 0) || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) { cm->frame_type = KEY_FRAME; rc->frames_to_key = cpi->oxcf.key_freq; rc->kf_boost = DEFAULT_KF_BOOST; @@ -2636,7 +2649,8 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; int vbr_max_bits; - rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate); + rc->avg_frame_bandwidth = + (int)VPXMIN(oxcf->target_bandwidth / cpi->framerate, INT_MAX); rc->min_frame_bandwidth = (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100); @@ -2690,7 +2704,7 @@ static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) { } // Fast redistribution of bits arising from massive local undershoot. - // Dont do it for kf,arf,gf or overlay frames. + // Don't do it for kf, arf, gf or overlay frames.
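Several rate-control hunks above share one overflow fix: a frame target scaled by a ratio or percentage is now computed in 64 bits and saturated before narrowing back to `int`, so a very large `target_bandwidth` can no longer wrap. The pattern, condensed into a hypothetical helper:

```c
#include <limits.h>
#include <stdint.h>

/* Scale target by num/den in 64-bit and saturate to int range. E.g. the
 * keyframe target scales avg_frame_bandwidth by 25/1, and the CBR
 * under/overshoot adjustments scale the target by pct/200. */
static int scale_target_saturating(int target, int num, int den) {
  const int64_t scaled = (int64_t)target * num / den;
  if (scaled > INT_MAX) return INT_MAX;
  if (scaled < INT_MIN) return INT_MIN;
  return (int)scaled;
}
```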
if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref && rc->vbr_bits_off_target_fast) { int one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, *this_frame_target); @@ -3269,11 +3283,9 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { MODE_INFO **mi = cm->mi_grid_visible; int sum_intra_usage = 0; int mi_row, mi_col; - int tot = 0; for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { if (mi[0]->ref_frame[0] == INTRA_FRAME) sum_intra_usage++; - tot++; mi++; } mi += 8; diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 96a8fd3f1..48c49e937 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -350,6 +350,8 @@ void vp9_estimate_qp_gop(struct VP9_COMP *cpi); void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi); +void vp9_update_buffer_level_svc_preencode(struct VP9_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 58dd75b44..95c95971c 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -513,22 +513,6 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; } -static void model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE], - int r_q10[MAX_MB_PLANE], - int d_q10[MAX_MB_PLANE]) { - int i; - const int one_q10 = 1 << 10; - for (i = 0; i < MAX_MB_PLANE; ++i) { - const int tmp = (xsq_q10[i] >> 2) + 8; - const int k = get_msb(tmp) - 3; - const int xq = (k << 3) + ((tmp >> k) & 0x7); - const int a_q10 = ((xsq_q10[i] - xsq_iq_q10[xq]) << 10) >> (2 + k); - const int b_q10 = one_q10 - a_q10; - r_q10[i] = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; - d_q10[i] = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; - } -} - static const uint32_t MAX_XSQ_Q10 = 245727; void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, @@ -554,30 +538,6 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, } } -// Implements a fixed length vector form of vp9_model_rd_from_var_lapndz where -// vectors are of length MAX_MB_PLANE and all elements of var are non-zero. -void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], - unsigned int n_log2[MAX_MB_PLANE], - unsigned int qstep[MAX_MB_PLANE], - int64_t *rate_sum, int64_t *dist_sum) { - int i; - int xsq_q10[MAX_MB_PLANE], d_q10[MAX_MB_PLANE], r_q10[MAX_MB_PLANE]; - for (i = 0; i < MAX_MB_PLANE; ++i) { - const uint64_t xsq_q10_64 = - (((uint64_t)qstep[i] * qstep[i] << (n_log2[i] + 10)) + (var[i] >> 1)) / - var[i]; - xsq_q10[i] = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10); - } - model_rd_norm_vec(xsq_q10, r_q10, d_q10); - for (i = 0; i < MAX_MB_PLANE; ++i) { - int rate = - ROUND_POWER_OF_TWO(r_q10[i] << n_log2[i], 10 - VP9_PROB_COST_SHIFT); - int64_t dist = (var[i] * (int64_t)d_q10[i] + 512) >> 10; - *rate_sum += rate; - *dist_sum += dist; - } -} - // Disable gcc 12.2 false positive warning. 
// warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] #if defined(__GNUC__) && !defined(__clang__) diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index d2bc5e60e..6c61ae514 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -121,11 +121,9 @@ typedef struct RD_OPT { int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int RDMULT; int RDDIV; double r0; @@ -166,11 +164,6 @@ void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex); void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, unsigned int qstep, int *rate, int64_t *dist); -void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], - unsigned int n_log2[MAX_MB_PLANE], - unsigned int qstep[MAX_MB_PLANE], - int64_t *rate_sum, int64_t *dist_sum); - int vp9_get_switchable_rate(const struct VP9_COMP *cpi, const MACROBLOCKD *const xd); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index a464ce38f..974e43c90 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -77,7 +77,7 @@ struct rdcost_block_args { int64_t best_rd; int exit_early; int use_fast_coef_costing; - const scan_order *so; + const ScanOrder *so; uint8_t skippable; struct buf_2d *this_recon; }; @@ -86,28 +86,28 @@ struct rdcost_block_args { #if !CONFIG_REALTIME_ONLY static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { - { NEARESTMV, { LAST_FRAME, NONE } }, - { NEARESTMV, { ALTREF_FRAME, NONE } }, - { NEARESTMV, { GOLDEN_FRAME, NONE } }, + { NEARESTMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEARESTMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEARESTMV, { GOLDEN_FRAME, NO_REF_FRAME } }, - { DC_PRED, { INTRA_FRAME, NONE } }, + { DC_PRED, { INTRA_FRAME, NO_REF_FRAME } }, - { NEWMV, { LAST_FRAME, NONE } }, - { NEWMV, { ALTREF_FRAME, NONE } }, - { NEWMV, { GOLDEN_FRAME, NONE } }, + { NEWMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEWMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEWMV, { GOLDEN_FRAME, NO_REF_FRAME } }, - { NEARMV, { LAST_FRAME, NONE } }, - { NEARMV, { ALTREF_FRAME, NONE } }, - { NEARMV, { GOLDEN_FRAME, NONE } }, + { NEARMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEARMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEARMV, { GOLDEN_FRAME, NO_REF_FRAME } }, - { ZEROMV, { LAST_FRAME, NONE } }, - { ZEROMV, { GOLDEN_FRAME, NONE } }, - { ZEROMV, { ALTREF_FRAME, NONE } }, + { ZEROMV, { LAST_FRAME, NO_REF_FRAME } }, + { ZEROMV, { GOLDEN_FRAME, NO_REF_FRAME } }, + { ZEROMV, { ALTREF_FRAME, NO_REF_FRAME } }, { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, - { TM_PRED, { INTRA_FRAME, NONE } }, + { TM_PRED, { INTRA_FRAME, NO_REF_FRAME } }, { NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEWMV, { LAST_FRAME, ALTREF_FRAME } }, @@ -117,20 +117,20 @@ static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { { ZEROMV, { LAST_FRAME, ALTREF_FRAME } }, { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } }, - { H_PRED, { INTRA_FRAME, NONE } }, - { V_PRED, { INTRA_FRAME, NONE } }, - { D135_PRED, { INTRA_FRAME, NONE } }, - { D207_PRED, { INTRA_FRAME, NONE } }, - { D153_PRED, { INTRA_FRAME, NONE } }, - { D63_PRED, { INTRA_FRAME, NONE } }, - { D117_PRED, { INTRA_FRAME, NONE } }, - { D45_PRED, { INTRA_FRAME, NONE } }, + { H_PRED, { 
INTRA_FRAME, NO_REF_FRAME } }, + { V_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D135_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D207_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D153_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D63_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D117_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D45_PRED, { INTRA_FRAME, NO_REF_FRAME } }, }; static const REF_DEFINITION vp9_ref_order[MAX_REFS] = { - { { LAST_FRAME, NONE } }, { { GOLDEN_FRAME, NONE } }, - { { ALTREF_FRAME, NONE } }, { { LAST_FRAME, ALTREF_FRAME } }, - { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NONE } }, + { { LAST_FRAME, NO_REF_FRAME } }, { { GOLDEN_FRAME, NO_REF_FRAME } }, + { { ALTREF_FRAME, NO_REF_FRAME } }, { { LAST_FRAME, ALTREF_FRAME } }, + { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NO_REF_FRAME } }, }; #endif // !CONFIG_REALTIME_ONLY @@ -160,10 +160,13 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n, } #if !CONFIG_REALTIME_ONLY -static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, - MACROBLOCKD *xd, int *out_rate_sum, - int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb) { +// Planewise build inter prediction and compute rdcost with early termination +// option +static int build_inter_pred_model_rd_earlyterm( + VP9_COMP *cpi, int mi_row, int mi_col, BLOCK_SIZE bsize, MACROBLOCK *x, + MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, + int *skip_txfm_sb, int64_t *skip_sse_sb, int do_earlyterm, + int64_t best_rd) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. @@ -176,19 +179,15 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, int64_t total_sse = 0; int skip_flag = 1; const int shift = 6; - int64_t dist; const int dequant_shift = #if CONFIG_VP9_HIGHBITDEPTH (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : #endif // CONFIG_VP9_HIGHBITDEPTH 3; - unsigned int qstep_vec[MAX_MB_PLANE]; - unsigned int nlog2_vec[MAX_MB_PLANE]; - unsigned int sum_sse_vec[MAX_MB_PLANE]; - int any_zero_sum_sse = 0; x->pred_sse[ref] = 0; + // Build prediction signal, compute stats and RD cost on per-plane basis for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblock_plane *const p = &x->plane[i]; struct macroblockd_plane *const pd = &xd->plane[i]; @@ -207,7 +206,14 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, int idx, idy; int lw = b_width_log2_lookup[unit_size] + 2; int lh = b_height_log2_lookup[unit_size] + 2; + unsigned int qstep; + unsigned int nlog2; + int64_t dist = 0; + // Build inter predictor + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + + // Compute useful stats for (idy = 0; idy < bh; ++idy) { for (idx = 0; idx < bw; ++idx) { uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw); @@ -243,46 +249,36 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, } total_sse += sum_sse; - sum_sse_vec[i] = sum_sse; - any_zero_sum_sse = any_zero_sum_sse || (sum_sse == 0); - qstep_vec[i] = pd->dequant[1] >> dequant_shift; - nlog2_vec[i] = num_pels_log2_lookup[bs]; - } + qstep = pd->dequant[1] >> dequant_shift; + nlog2 = num_pels_log2_lookup[bs]; - // Fast approximate the modelling function. - if (cpi->sf.simple_model_rd_from_var) { - for (i = 0; i < MAX_MB_PLANE; ++i) { + // Fast approximate the modelling function. 
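`build_inter_pred_model_rd_earlyterm()` replaces the vectorized three-plane model with a single per-plane loop so that, as the lines that follow show, the running RD cost can be checked against `best_rd` after every plane and the candidate abandoned early. The control flow reduced to scalars; `rd_cost()` here is an illustrative weighting, not the encoder's exact RDCOST macro:

```c
#include <stdint.h>

/* Illustrative RD weighting, not the encoder's exact macro. */
static int64_t rd_cost(int rdmult, int64_t rate, int64_t dist) {
  return ((rate * rdmult) >> 8) + dist;
}

/* Accumulate per-plane rate/distortion estimates; return 1 as soon as the
 * running cost already exceeds the best mode found so far. */
static int model_rd_earlyterm_sketch(int num_planes, const int *plane_rate,
                                     const int64_t *plane_dist, int rdmult,
                                     int64_t best_rd, int64_t *rate_sum,
                                     int64_t *dist_sum) {
  int i;
  *rate_sum = 0;
  *dist_sum = 0;
  for (i = 0; i < num_planes; ++i) {
    *rate_sum += plane_rate[i];
    *dist_sum += plane_dist[i];
    if (rd_cost(rdmult, *rate_sum, *dist_sum) >= best_rd) return 1;
  }
  return 0;
}
```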
+ if (cpi->sf.simple_model_rd_from_var) { int64_t rate; - const int64_t square_error = sum_sse_vec[i]; - int quantizer = qstep_vec[i]; - - if (quantizer < 120) - rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT); + if (qstep < 120) + rate = ((int64_t)sum_sse * (280 - qstep)) >> (16 - VP9_PROB_COST_SHIFT); else rate = 0; - dist = (square_error * quantizer) >> 8; + dist = ((int64_t)sum_sse * qstep) >> 8; rate_sum += rate; - dist_sum += dist; - } - } else { - if (any_zero_sum_sse) { - for (i = 0; i < MAX_MB_PLANE; ++i) { - int rate; - vp9_model_rd_from_var_lapndz(sum_sse_vec[i], nlog2_vec[i], qstep_vec[i], - &rate, &dist); - rate_sum += rate; - dist_sum += dist; - } } else { - vp9_model_rd_from_var_lapndz_vec(sum_sse_vec, nlog2_vec, qstep_vec, - &rate_sum, &dist_sum); + int rate; + vp9_model_rd_from_var_lapndz(sum_sse, nlog2, qstep, &rate, &dist); + rate_sum += rate; + } + dist_sum += dist; + if (do_earlyterm) { + if (RDCOST(x->rdmult, x->rddiv, rate_sum, + dist_sum << VP9_DIST_SCALE_LOG2) >= best_rd) + return 1; } } - *skip_txfm_sb = skip_flag; *skip_sse_sb = total_sse << VP9_DIST_SCALE_LOG2; *out_rate_sum = (int)rate_sum; *out_dist_sum = dist_sum << VP9_DIST_SCALE_LOG2; + + return 0; } #endif // !CONFIG_REALTIME_ONLY @@ -462,11 +458,6 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, return cost; } -static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim, - int subsampling_dim, int blk_dim) { - return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim; -} - // Copy all visible 4x4s in the transform block. static void copy_block_visible(const MACROBLOCKD *xd, const struct macroblockd_plane *const pd, @@ -567,47 +558,11 @@ static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd, return sse; } -// Compute the squares sum squares on all visible 4x4s in the transform block. -static int64_t sum_squares_visible(const MACROBLOCKD *xd, - const struct macroblockd_plane *const pd, - const int16_t *diff, const int diff_stride, - int blk_row, int blk_col, - const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize) { - int64_t sse; - const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; - const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; - const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; - int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge, - pd->subsampling_x, blk_col); - int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge, - pd->subsampling_y, blk_row); - if (tx_bsize == BLOCK_4X4 || - (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { - assert(tx_4x4_w == tx_4x4_h); - sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2); - } else { - int r, c; - int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); - int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); - sse = 0; - // if we are in the unrestricted motion border. - for (r = 0; r < max_r; ++r) { - // Skip visiting the sub blocks that are wholly within the UMV. 
- for (c = 0; c < max_c; ++c) { - sse += (int64_t)vpx_sum_squares_2d_i16( - diff + r * diff_stride * 4 + c * 4, diff_stride, 4); - } - } - } - return sse; -} - static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, TX_SIZE tx_size, int64_t *out_dist, - int64_t *out_sse, struct buf_2d *out_recon) { + int64_t *out_sse, struct buf_2d *out_recon, + int sse_calc_done) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; @@ -633,15 +588,15 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, if (x->skip_encode && !is_inter_block(xd->mi[0])) { // TODO(jingning): tune the model to better capture the distortion. - const int64_t p = + const int64_t mean_quant_error = (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >> #if CONFIG_VP9_HIGHBITDEPTH (shift + 2 + (bd - 8) * 2); #else (shift + 2); #endif // CONFIG_VP9_HIGHBITDEPTH - *out_dist += (p >> 4); - *out_sse += p; + *out_dist += (mean_quant_error >> 4); + *out_sse += mean_quant_error; } } else { const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; @@ -657,8 +612,12 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); unsigned int tmp; - tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, - blk_col, plane_bsize, tx_bsize); + if (sse_calc_done) { + tmp = (unsigned int)(*out_sse); + } else { + tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, + blk_col, plane_bsize, tx_bsize); + } *out_sse = (int64_t)tmp * 16; if (out_recon) { const int out_recon_idx = 4 * (blk_row * out_recon->stride + blk_col); @@ -754,20 +713,29 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, const struct macroblockd_plane *const pd = &xd->plane[plane]; const int dst_stride = pd->dst.stride; const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)]; + const int enable_trellis_opt = args->cpi->sf.trellis_opt_tx_rd.method; + const double trellis_opt_thresh = args->cpi->sf.trellis_opt_tx_rd.thresh; + int sse_calc_done = 0; +#if CONFIG_MISMATCH_DEBUG + struct encode_b_args encode_b_arg = { + x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done, + &sse, args->t_above, args->t_left, &mi->skip, + 0, // mi_row + 0, // mi_col + 0 // output_enabled + }; +#else + struct encode_b_args encode_b_arg = { + x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done, + &sse, args->t_above, args->t_left, &mi->skip + }; +#endif if (args->exit_early) return; if (!is_inter_block(mi)) { -#if CONFIG_MISMATCH_DEBUG - struct encode_b_args intra_arg = { - x, x->block_qcoeff_opt, args->t_above, args->t_left, &mi->skip, 0, 0, 0 - }; -#else - struct encode_b_args intra_arg = { x, x->block_qcoeff_opt, args->t_above, - args->t_left, &mi->skip }; -#endif vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, - &intra_arg); + &encode_b_arg); if (recon) { uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)]; copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride, @@ -775,16 +743,21 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, } if (x->block_tx_domain) { dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse, /*recon =*/0); + tx_size, &dist, &sse, /*out_recon=*/NULL, sse_calc_done); } else { const struct macroblock_plane 
*const p = &x->plane[plane]; const int src_stride = p->src.stride; - const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)]; - const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; unsigned int tmp; - sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col, - plane_bsize, tx_bsize); + if (!sse_calc_done) { + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int16_t *diff = + &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; + int visible_width, visible_height; + sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col, + plane_bsize, tx_bsize, &visible_width, + &visible_height); + } #if CONFIG_VP9_HIGHBITDEPTH if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8)) sse = ROUND64_POWER_OF_TWO(sse, (xd->bd - 8) * 2); @@ -808,12 +781,18 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, if (skip_txfm_flag == SKIP_TXFM_NONE || (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) { + const struct macroblock_plane *const p = &x->plane[plane]; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int16_t *const diff = + &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; + const int use_trellis_opt = + do_trellis_opt(pd, diff, diff_stride, blk_row, blk_col, plane_bsize, + tx_size, &encode_b_arg); // full forward transform and quantization vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); - if (x->block_qcoeff_opt) - vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); + if (use_trellis_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse, recon); + tx_size, &dist, &sse, recon, sse_calc_done); } else if (skip_txfm_flag == SKIP_TXFM_AC_ONLY) { // compute DC coefficient tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); @@ -1149,13 +1128,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride, xd->bd); if (xd->lossless) { - const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_highbd_fwht4x4(src_diff, coeff, 8); - vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, + eob, so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 
1 : 0); @@ -1166,16 +1144,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, } else { int64_t unused; const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block); - const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type]; + const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); if (tx_type == DCT_DCT) vpx_highbd_fdct4x4(src_diff, coeff, 8); else vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type); - vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, + eob, so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); distortion += vp9_highbd_block_error_dispatch( @@ -1256,13 +1233,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride); if (xd->lossless) { - const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fwht4x4(src_diff, coeff, 8); - vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0; @@ -1273,13 +1249,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, } else { int64_t unused; const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block); - const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type]; + const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fht4x4(src_diff, coeff, 8, tx_type); - vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 
1 : 0; @@ -1416,7 +1391,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, mic->mode = mode; super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, - bsize, best_rd, /*recon = */ 0); + bsize, best_rd, /*recon=*/NULL); if (this_rate_tokenonly == INT_MAX) continue; @@ -1456,7 +1431,6 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, if (ref_best_rd < 0) is_cost_valid = 0; if (is_inter_block(mi) && is_cost_valid) { - int plane; for (plane = 1; plane < MAX_MB_PLANE; ++plane) vp9_subtract_plane(x, bsize, plane); } @@ -1469,7 +1443,7 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, for (plane = 1; plane < MAX_MB_PLANE; ++plane) { txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd, plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing, - /*recon = */ 0); + /*recon=*/NULL); if (pnrate == INT_MAX) { is_cost_valid = 0; break; @@ -1652,7 +1626,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)]; int64_t thisdistortion = 0, thissse = 0; int thisrate = 0, ref; - const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; const int is_compound = has_second_ref(mi); const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; @@ -1732,14 +1706,12 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), coeff, 8); #if CONFIG_VP9_HIGHBITDEPTH - vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); thisdistortion += vp9_highbd_block_error_dispatch( coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd); #else - vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, so); thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -1833,7 +1805,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi, const MV_REFERENCE_FRAME ref_frames[2]) { if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && frame_mv[this_mode][ref_frames[0]].as_int == 0 && - (ref_frames[1] == NONE || + (ref_frames[1] == NO_REF_FRAME || frame_mv[this_mode][ref_frames[1]].as_int == 0)) { int rfc = mode_context[ref_frames[0]]; int c1 = cost_mv_ref(cpi, NEARMV, rfc); @@ -1846,7 +1818,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi, if (c2 > c3) return 0; } else { assert(this_mode == ZEROMV); - if (ref_frames[1] == NONE) { + if (ref_frames[1] == NO_REF_FRAME) { if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) || (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0)) return 0; @@ -1862,10 +1834,80 @@ static int check_best_zero_mv(const VP9_COMP *cpi, return 1; } +static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) { + if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) { + int_mv cur_fullpel_mv, prev_fullpel_mv; + cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3; + cur_fullpel_mv.as_mv.col = iter_mvs[ite][id].as_mv.col >> 3; + prev_fullpel_mv.as_mv.row = iter_mvs[ite - 2][id].as_mv.row >> 3; + 
prev_fullpel_mv.as_mv.col = iter_mvs[ite - 2][id].as_mv.col >> 3; + if (cur_fullpel_mv.as_int == prev_fullpel_mv.as_int) return 1; + } + return 0; +} + +// Compares motion vector and mode rate of current mode and given mode. +static INLINE int compare_mv_mode_rate(MV this_mv, MV mode_mv, + int this_mode_rate, int mode_rate, + int mv_thresh) { + const int mv_diff = + abs(mode_mv.col - this_mv.col) + abs(mode_mv.row - this_mv.row); + if (mv_diff <= mv_thresh && mode_rate < this_mode_rate) return 1; + return 0; +} + +// Skips single reference inter modes NEARMV and ZEROMV based on motion vector +// difference and mode rate. +static INLINE int skip_single_mode_based_on_mode_rate( + int_mv (*mode_mv)[MAX_REF_FRAMES], int *single_mode_rate, int this_mode, + int ref0, int this_mode_rate, int best_mode_index) { + MV this_mv = mode_mv[this_mode][ref0].as_mv; + const int mv_thresh = 3; + + // Pruning is not applicable for NEARESTMV or NEWMV modes. + if (this_mode == NEARESTMV || this_mode == NEWMV) return 0; + // Pruning is not done when reference frame of the mode is same as best + // reference so far. + if (best_mode_index > 0 && + ref0 == vp9_mode_order[best_mode_index].ref_frame[0]) + return 0; + + // Check absolute mv difference and mode rate of current mode w.r.t NEARESTMV + if (compare_mv_mode_rate( + this_mv, mode_mv[NEARESTMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEARESTMV)], mv_thresh)) + return 1; + + // Check absolute mv difference and mode rate of current mode w.r.t NEWMV + if (compare_mv_mode_rate(this_mv, mode_mv[NEWMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEWMV)], mv_thresh)) + return 1; + + // Pruning w.r.t NEARMV is applicable only for ZEROMV mode + if (this_mode == NEARMV) return 0; + // Check absolute mv difference and mode rate of current mode w.r.t NEARMV + if (compare_mv_mode_rate(this_mv, mode_mv[NEARMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEARMV)], mv_thresh)) + return 1; + return 0; +} + +#define MAX_JOINT_MV_SEARCH_ITERS 4 +static INLINE int get_joint_search_iters(int sf_level, BLOCK_SIZE bsize) { + int num_iters = MAX_JOINT_MV_SEARCH_ITERS; // sf_level = 0 + if (sf_level >= 2) + num_iters = 0; + else if (sf_level >= 1) + num_iters = bsize < BLOCK_8X8 + ? 0 + : (bsize <= BLOCK_16X16 ? 2 : MAX_JOINT_MV_SEARCH_ITERS); + return num_iters; +} + static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], - int *rate_mv) { + int *rate_mv, int num_iters) { const VP9_COMMON *const cm = &cpi->common; const int pw = 4 * num_4x4_blocks_wide_lookup[bsize]; const int ph = 4 * num_4x4_blocks_high_lookup[bsize]; @@ -1874,6 +1916,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const int refs[2] = { mi->ref_frame[0], mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1] }; int_mv ref_mv[2]; + int_mv iter_mvs[MAX_JOINT_MV_SEARCH_ITERS][2]; int ite, ref; const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; struct scale_factors sf; @@ -1888,12 +1931,15 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Prediction buffer from second frame. 
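`get_joint_search_iters()` above replaces the fixed four-iteration compound search with a speed-feature-controlled budget, and `skip_iters()` lets the loop stop once the MV pair stops moving. A simplified sketch of the loop's shape (it compares both MVs at full precision, whereas `skip_iters()` checks the refined MV at full-pel only; `refine()` stands in for one diamond-plus-subpel pass):

```c
/* Minimal sketch of the alternating joint search: even iterations refine
 * mv[0] against a prediction from mv[1], odd iterations the reverse; stop
 * when the budget runs out, refinement fails, or the state repeats. */
typedef struct { int row, col; } mv_sketch;

static int mv_eq(mv_sketch a, mv_sketch b) {
  return a.row == b.row && a.col == b.col;
}

static void joint_search_sketch(mv_sketch mv[2], int num_iters,
                                int (*refine)(mv_sketch *cur,
                                              const mv_sketch *other)) {
  mv_sketch hist[4][2]; /* num_iters is capped at MAX_JOINT_MV_SEARCH_ITERS */
  int ite;
  for (ite = 0; ite < num_iters; ++ite) {
    const int id = ite & 1; /* which reference gets refined this round */
    hist[ite][0] = mv[0];
    hist[ite][1] = mv[1];
    if (ite >= 2 && mv_eq(hist[ite][id], hist[ite - 2][id]) &&
        mv_eq(hist[ite][!id], hist[ite - 2][!id]))
      break; /* same MV pair as two iterations ago: converged */
    if (!refine(&mv[id], &mv[!id])) break; /* no better MV found */
  }
}
```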
#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]); + DECLARE_ALIGNED(32, uint16_t, second_pred_alloc_16[64 * 64]); uint8_t *second_pred; #else - DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, second_pred[64 * 64]); #endif // CONFIG_VP9_HIGHBITDEPTH + // Check that the number of iterations does not exceed the max + assert(num_iters <= MAX_JOINT_MV_SEARCH_ITERS); + for (ref = 0; ref < 2; ++ref) { ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0]; @@ -1909,6 +1955,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int; + iter_mvs[0][ref].as_int = single_newmv[refs[ref]].as_int; } // Since we have scaled the reference frames to match the size of the current @@ -1923,7 +1970,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Allow joint search multiple times iteratively for each reference frame // and break out of the search loop if it couldn't find a better mv. - for (ite = 0; ite < 4; ite++) { + for (ite = 0; ite < num_iters; ite++) { struct buf_2d ref_yv12[2]; uint32_t bestsme = UINT_MAX; int sadpb = x->sadperbit16; @@ -1935,6 +1982,11 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // odd iterations search in the second. The predictor // found for the 'other' reference frame is factored in. + // Skip further iterations of the search if, in the previous iteration, the + // motion vector of the searched ref frame is unchanged, and the other ref + // frame's full-pixel mv is unchanged. + if (skip_iters(iter_mvs, ite, id)) break; + // Initialized here because of compiler problem in Visual Studio. ref_yv12[0] = xd->plane[0].pre[0]; ref_yv12[1] = xd->plane[0].pre[1]; @@ -2000,6 +2052,10 @@ } else { break; } + if (ite < num_iters - 1) { + iter_mvs[ite + 1][0].as_int = frame_mv[refs[0]].as_int; + iter_mvs[ite + 1][1].as_int = frame_mv[refs[1]].as_int; + } } *rate_mv = 0; @@ -2020,7 +2076,7 @@ static int64_t rd_pick_best_sub8x8_mode( VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv, - int_mv *second_best_ref_mv, int64_t best_rd, int *returntotrate, + int_mv *second_best_ref_mv, int64_t best_rd_so_far, int *returntotrate, int *returnyrate, int64_t *returndistortion, int *skippable, int64_t *psse, int mvthresh, int_mv seg_mvs[4][MAX_REF_FRAMES], BEST_SEG_INFO *bsi_buf, int filter_idx, int mi_row, int mi_col) { @@ -2053,7 +2109,7 @@ vp9_zero(*bsi); - bsi->segment_rd = best_rd; + bsi->segment_rd = best_rd_so_far; bsi->ref_mv[0] = best_ref_mv; bsi->ref_mv[1] = second_best_ref_mv; bsi->mvp.as_int = best_ref_mv->as_int; @@ -2079,14 +2135,14 @@ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; PREDICTION_MODE mode_selected = ZEROMV; int64_t best_rd = INT64_MAX; - const int i = idy * 2 + idx; + const int block = idy * 2 + idx; int ref; for (ref = 0; ref < 1 + has_second_rf; ++ref) { const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; frame_mv[ZEROMV][frame].as_int = 0; vp9_append_sub8x8_mvs_for_idx( - cm, xd, i, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame], + cm, xd, block, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame], &frame_mv[NEARMV][frame], mbmi_ext->mode_context); } @@ -2096,7 +2152,7 @@ struct
buf_2d orig_pre[2]; mode_idx = INTER_OFFSET(this_mode); - bsi->rdstat[i][mode_idx].brdcost = INT64_MAX; + bsi->rdstat[block][mode_idx].brdcost = INT64_MAX; if (!(inter_mode_mask & (1 << this_mode))) continue; if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv, @@ -2104,14 +2160,14 @@ static int64_t rd_pick_best_sub8x8_mode( continue; memcpy(orig_pre, pd->pre, sizeof(orig_pre)); - memcpy(bsi->rdstat[i][mode_idx].ta, t_above, - sizeof(bsi->rdstat[i][mode_idx].ta)); - memcpy(bsi->rdstat[i][mode_idx].tl, t_left, - sizeof(bsi->rdstat[i][mode_idx].tl)); + memcpy(bsi->rdstat[block][mode_idx].ta, t_above, + sizeof(bsi->rdstat[block][mode_idx].ta)); + memcpy(bsi->rdstat[block][mode_idx].tl, t_left, + sizeof(bsi->rdstat[block][mode_idx].tl)); // motion search for newmv (single predictor case only) if (!has_second_rf && this_mode == NEWMV && - seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV) { + seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) { MV *const new_mv = &mode_mv[NEWMV][0].as_mv; int step_param = 0; uint32_t bestsme = UINT_MAX; @@ -2121,18 +2177,19 @@ static int64_t rd_pick_best_sub8x8_mode( int cost_list[5]; const MvLimits tmp_mv_limits = x->mv_limits; - /* Is the best so far sufficiently good that we cant justify doing + /* Is the best so far sufficiently good that we can't justify doing * and new motion search. */ if (best_rd < label_mv_thresh) break; if (cpi->oxcf.mode != BEST) { // use previous block's result as next block's MV predictor. - if (i > 0) { - bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int; - if (i == 2) bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int; + if (block > 0) { + bsi->mvp.as_int = mi->bmi[block - 1].as_mv[0].as_int; + if (block == 2) + bsi->mvp.as_int = mi->bmi[block - 2].as_mv[0].as_int; } } - if (i == 0) + if (block == 0) max_mv = x->max_mv_context[mi->ref_frame[0]]; else max_mv = @@ -2161,7 +2218,7 @@ static int64_t rd_pick_best_sub8x8_mode( } // adjust src pointer for this block - mi_buf_shift(x, i); + mi_buf_shift(x, block); vp9_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv); @@ -2184,7 +2241,7 @@ static int64_t rd_pick_best_sub8x8_mode( cpi->sf.use_accurate_subpel_search); // save motion search result for use in compound prediction - seg_mvs[i][mi->ref_frame[0]].as_mv = *new_mv; + seg_mvs[block][mi->ref_frame[0]].as_mv = *new_mv; } x->pred_mv[mi->ref_frame[0]] = *new_mv; @@ -2194,40 +2251,44 @@ static int64_t rd_pick_best_sub8x8_mode( } if (has_second_rf) { - if (seg_mvs[i][mi->ref_frame[1]].as_int == INVALID_MV || - seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV) + if (seg_mvs[block][mi->ref_frame[1]].as_int == INVALID_MV || + seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) continue; } if (has_second_rf && this_mode == NEWMV && mi->interp_filter == EIGHTTAP) { + // Decide number of joint motion search iterations + const int num_joint_search_iters = get_joint_search_iters( + cpi->sf.comp_inter_joint_search_iter_level, bsize); // adjust src pointers - mi_buf_shift(x, i); - if (sf->comp_inter_joint_search_thresh <= bsize) { + mi_buf_shift(x, block); + if (num_joint_search_iters) { int rate_mv; joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row, - mi_col, seg_mvs[i], &rate_mv); - seg_mvs[i][mi->ref_frame[0]].as_int = + mi_col, seg_mvs[block], &rate_mv, + num_joint_search_iters); + seg_mvs[block][mi->ref_frame[0]].as_int = frame_mv[this_mode][mi->ref_frame[0]].as_int; - seg_mvs[i][mi->ref_frame[1]].as_int = + seg_mvs[block][mi->ref_frame[1]].as_int = frame_mv[this_mode][mi->ref_frame[1]].as_int; } // 
restore src pointers mi_buf_restore(x, orig_src, orig_pre); } - bsi->rdstat[i][mode_idx].brate = set_and_cost_bmi_mvs( - cpi, x, xd, i, this_mode, mode_mv[this_mode], frame_mv, seg_mvs[i], - bsi->ref_mv, x->nmvjointcost, x->mvcost); + bsi->rdstat[block][mode_idx].brate = set_and_cost_bmi_mvs( + cpi, x, xd, block, this_mode, mode_mv[this_mode], frame_mv, + seg_mvs[block], bsi->ref_mv, x->nmvjointcost, x->mvcost); for (ref = 0; ref < 1 + has_second_rf; ++ref) { - bsi->rdstat[i][mode_idx].mvs[ref].as_int = + bsi->rdstat[block][mode_idx].mvs[ref].as_int = mode_mv[this_mode][ref].as_int; if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int = + bsi->rdstat[block + 1][mode_idx].mvs[ref].as_int = mode_mv[this_mode][ref].as_int; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int = + bsi->rdstat[block + 2][mode_idx].mvs[ref].as_int = mode_mv[this_mode][ref].as_int; } @@ -2245,7 +2306,7 @@ static int64_t rd_pick_best_sub8x8_mode( for (ref = 0; ref < 1 + has_second_rf; ++ref) { subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv); have_ref &= mode_mv[this_mode][ref].as_int == - ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int; + ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int; } if (filter_idx > 1 && !subpelmv && !have_ref) { @@ -2253,53 +2314,55 @@ static int64_t rd_pick_best_sub8x8_mode( have_ref = 1; for (ref = 0; ref < 1 + has_second_rf; ++ref) have_ref &= mode_mv[this_mode][ref].as_int == - ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int; + ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int; } if (!subpelmv && have_ref && - ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { - memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx], - sizeof(SEG_RDSTAT)); + ref_bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) { + memcpy(&bsi->rdstat[block][mode_idx], + &ref_bsi->rdstat[block][mode_idx], sizeof(SEG_RDSTAT)); if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].eobs = - ref_bsi->rdstat[i + 1][mode_idx].eobs; + bsi->rdstat[block + 1][mode_idx].eobs = + ref_bsi->rdstat[block + 1][mode_idx].eobs; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].eobs = - ref_bsi->rdstat[i + 2][mode_idx].eobs; + bsi->rdstat[block + 2][mode_idx].eobs = + ref_bsi->rdstat[block + 2][mode_idx].eobs; - if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { + if (bsi->rdstat[block][mode_idx].brdcost < best_rd) { mode_selected = this_mode; - best_rd = bsi->rdstat[i][mode_idx].brdcost; + best_rd = bsi->rdstat[block][mode_idx].brdcost; } continue; } } - bsi->rdstat[i][mode_idx].brdcost = encode_inter_mb_segment( - cpi, x, bsi->segment_rd - this_segment_rd, i, - &bsi->rdstat[i][mode_idx].byrate, &bsi->rdstat[i][mode_idx].bdist, - &bsi->rdstat[i][mode_idx].bsse, bsi->rdstat[i][mode_idx].ta, - bsi->rdstat[i][mode_idx].tl, mi_row, mi_col); - if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { - bsi->rdstat[i][mode_idx].brdcost += - RDCOST(x->rdmult, x->rddiv, bsi->rdstat[i][mode_idx].brate, 0); - bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate; - bsi->rdstat[i][mode_idx].eobs = p->eobs[i]; + bsi->rdstat[block][mode_idx].brdcost = encode_inter_mb_segment( + cpi, x, bsi->segment_rd - this_segment_rd, block, + &bsi->rdstat[block][mode_idx].byrate, + &bsi->rdstat[block][mode_idx].bdist, + &bsi->rdstat[block][mode_idx].bsse, bsi->rdstat[block][mode_idx].ta, + bsi->rdstat[block][mode_idx].tl, mi_row, mi_col); + if (bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) { + bsi->rdstat[block][mode_idx].brdcost += RDCOST( + x->rdmult, x->rddiv, 
bsi->rdstat[block][mode_idx].brate, 0); + bsi->rdstat[block][mode_idx].brate += + bsi->rdstat[block][mode_idx].byrate; + bsi->rdstat[block][mode_idx].eobs = p->eobs[block]; if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1]; + bsi->rdstat[block + 1][mode_idx].eobs = p->eobs[block + 1]; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2]; + bsi->rdstat[block + 2][mode_idx].eobs = p->eobs[block + 2]; } - if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { + if (bsi->rdstat[block][mode_idx].brdcost < best_rd) { mode_selected = this_mode; - best_rd = bsi->rdstat[i][mode_idx].brdcost; + best_rd = bsi->rdstat[block][mode_idx].brdcost; } } /*for each 4x4 mode*/ if (best_rd == INT64_MAX) { int iy, midx; - for (iy = i + 1; iy < 4; ++iy) + for (iy = block + 1; iy < 4; ++iy) for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; @@ -2307,22 +2370,22 @@ static int64_t rd_pick_best_sub8x8_mode( } mode_idx = INTER_OFFSET(mode_selected); - memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above)); - memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left)); + memcpy(t_above, bsi->rdstat[block][mode_idx].ta, sizeof(t_above)); + memcpy(t_left, bsi->rdstat[block][mode_idx].tl, sizeof(t_left)); - set_and_cost_bmi_mvs(cpi, x, xd, i, mode_selected, mode_mv[mode_selected], - frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost, - x->mvcost); + set_and_cost_bmi_mvs(cpi, x, xd, block, mode_selected, + mode_mv[mode_selected], frame_mv, seg_mvs[block], + bsi->ref_mv, x->nmvjointcost, x->mvcost); - br += bsi->rdstat[i][mode_idx].brate; - bd += bsi->rdstat[i][mode_idx].bdist; - block_sse += bsi->rdstat[i][mode_idx].bsse; - segmentyrate += bsi->rdstat[i][mode_idx].byrate; - this_segment_rd += bsi->rdstat[i][mode_idx].brdcost; + br += bsi->rdstat[block][mode_idx].brate; + bd += bsi->rdstat[block][mode_idx].bdist; + block_sse += bsi->rdstat[block][mode_idx].bsse; + segmentyrate += bsi->rdstat[block][mode_idx].byrate; + this_segment_rd += bsi->rdstat[block][mode_idx].brdcost; if (this_segment_rd > bsi->segment_rd) { int iy, midx; - for (iy = i + 1; iy < 4; ++iy) + for (iy = block + 1; iy < 4; ++iy) for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; @@ -2340,7 +2403,7 @@ static int64_t rd_pick_best_sub8x8_mode( // update the coding decisions for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode; - if (bsi->segment_rd > best_rd) return INT64_MAX; + if (bsi->segment_rd > best_rd_so_far) return INT64_MAX; /* set it to the best */ for (i = 0; i < 4; i++) { mode_idx = INTER_OFFSET(bsi->modes[i]); @@ -2585,9 +2648,9 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, tmp_mv->as_int = INVALID_MV; if (scaled_ref_frame) { - int i; - for (i = 0; i < MAX_MB_PLANE; ++i) - xd->plane[i].pre[0] = backup_yv12[i]; + int j; + for (j = 0; j < MAX_MB_PLANE; ++j) + xd->plane[j].pre[0] = backup_yv12[j]; } return; } @@ -2752,8 +2815,9 @@ static int64_t handle_inter_mode( struct buf_2d *recon, int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], INTERP_FILTER (*single_filter)[MAX_REF_FRAMES], - int (*single_skippable)[MAX_REF_FRAMES], int64_t *psse, - const int64_t ref_best_rd, int64_t *mask_filter, int64_t filter_cache[]) { + int (*single_skippable)[MAX_REF_FRAMES], int *single_mode_rate, + int64_t *psse, const int64_t ref_best_rd, int64_t *mask_filter, + int64_t 
filter_cache[], int best_mode_index) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *mi = xd->mi[0]; @@ -2771,9 +2835,8 @@ static int64_t handle_inter_mode( #else DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]); #endif // CONFIG_VP9_HIGHBITDEPTH - int pred_exists = 0; int intpel_mv; - int64_t rd, tmp_rd, best_rd = INT64_MAX; + int64_t rd, tmp_rd = INT64_MAX, best_rd = INT64_MAX; int best_needs_copy = 0; uint8_t *orig_dst[MAX_MB_PLANE]; int orig_dst_stride[MAX_MB_PLANE]; @@ -2782,13 +2845,12 @@ uint8_t skip_txfm[MAX_MB_PLANE << 2] = { 0 }; int64_t bsse[MAX_MB_PLANE << 2] = { 0 }; - int bsl = mi_width_log2_lookup[bsize]; - int pred_filter_search = - cpi->sf.cb_pred_filter_search - ? (((mi_row + mi_col) >> bsl) + - get_chessboard_index(cm->current_video_frame)) & - 0x1 - : 0; + const int bsl = mi_width_log2_lookup[bsize]; + const int blk_parity = (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & + 0x1; + const int pred_filter_search = + (cpi->sf.cb_pred_filter_search >= 2) && blk_parity; int skip_txfm_sb = 0; int64_t skip_sse_sb = INT64_MAX; @@ -2827,13 +2889,23 @@ if (this_mode == NEWMV) { int rate_mv; if (is_comp_pred) { + // Decide number of joint motion search iterations + const int num_joint_search_iters = get_joint_search_iters( + cpi->sf.comp_inter_joint_search_iter_level, bsize); + // Initialize mv using single prediction mode result. frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; - if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { + if (num_joint_search_iters) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, joint_motion_search_time); +#endif joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, - single_newmv, &rate_mv); + single_newmv, &rate_mv, num_joint_search_iters); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, joint_motion_search_time); +#endif } else { rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv, &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, @@ -2845,7 +2917,13 @@ *rate2 += rate_mv; } else { int_mv tmp_mv; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, single_motion_search_time); +#endif single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, single_motion_search_time); +#endif if (tmp_mv.as_int == INVALID_MV) return INT64_MAX; frame_mv[refs[0]].as_int = xd->mi[0]->bmi[0].as_mv[0].as_int = @@ -2899,23 +2977,45 @@ *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]); } + if (!is_comp_pred && cpi->sf.prune_single_mode_based_on_mv_diff_mode_rate) { + single_mode_rate[INTER_OFFSET(this_mode)] = *rate2; + // Prune NEARMV and ZEROMV modes based on motion vector difference and mode + // rate. + if (skip_single_mode_based_on_mode_rate(mode_mv, single_mode_rate, + this_mode, refs[0], *rate2, + best_mode_index)) { + // Check that when the single inter mode is pruned, the NEARESTMV or + // NEWMV modes are not early terminated. This ensures that not all + // single modes get skipped when the speed feature is enabled. 
+ assert(single_mode_rate[INTER_OFFSET(NEARESTMV)] != INT_MAX || + single_mode_rate[INTER_OFFSET(NEWMV)] != INT_MAX); + return INT64_MAX; + } + } if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd && mi->mode != NEARESTMV) return INT64_MAX; - pred_exists = 0; // Are all MVs integer pel for Y and UV intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv); if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, interp_filter_time); +#endif // Search for best switchable filter by checking the variance of // pred error irrespective of whether the filter will be used for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX; if (cm->interp_filter != BILINEAR) { + // Use cb pattern for filter eval when filter is not switchable + const int enable_interp_search = + (cpi->sf.cb_pred_filter_search && cm->interp_filter != SWITCHABLE) + ? blk_parity + : 1; if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) { best_filter = EIGHTTAP; - } else if (best_filter == SWITCHABLE) { + } else if (best_filter == SWITCHABLE && enable_interp_search) { int newbest; int tmp_rate_sum = 0; int64_t tmp_dist_sum = 0; @@ -2925,6 +3025,9 @@ static int64_t handle_inter_mode( int64_t rs_rd; int tmp_skip_sb = 0; int64_t tmp_skip_sse = INT64_MAX; + const int enable_earlyterm = + cpi->sf.early_term_interp_search_plane_rd && cm->interp_filter != i; + int64_t filt_best_rd; mi->interp_filter = i; rs = vp9_get_switchable_rate(cpi, xd); @@ -2958,9 +3061,16 @@ static int64_t handle_inter_mode( xd->plane[j].dst.stride = 64; } } - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, &tmp_skip_sb, - &tmp_skip_sse); + + filt_best_rd = + cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd; + if (build_inter_pred_model_rd_earlyterm( + cpi, mi_row, mi_col, bsize, x, xd, &rate_sum, &dist_sum, + &tmp_skip_sb, &tmp_skip_sse, enable_earlyterm, + filt_best_rd)) { + filter_cache[i] = INT64_MAX; + continue; + } rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum); filter_cache[i] = rd; @@ -2993,7 +3103,6 @@ static int64_t handle_inter_mode( if ((cm->interp_filter == SWITCHABLE && newbest) || (cm->interp_filter != SWITCHABLE && cm->interp_filter == mi->interp_filter)) { - pred_exists = 1; tmp_rd = best_rd; skip_txfm_sb = tmp_skip_sb; @@ -3005,12 +3114,15 @@ static int64_t handle_inter_mode( restore_dst_buf(xd, orig_dst, orig_dst_stride); } } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, interp_filter_time); +#endif // Set the appropriate filter mi->interp_filter = cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter; rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0; - if (pred_exists) { + if (tmp_rd != INT64_MAX) { if (best_needs_copy) { // again temporarily set the buffers to local memory to prevent a memcpy for (i = 0; i < MAX_MB_PLANE; i++) { @@ -3025,9 +3137,9 @@ static int64_t handle_inter_mode( // Handles the special case when a filter that is not in the // switchable list (ex. bilinear) is indicated at the frame level, or // skip condition holds. 
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb, - &skip_sse_sb); + build_inter_pred_model_rd_earlyterm( + cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb, + &skip_sse_sb, 0 /*do_earlyterm*/, INT64_MAX); rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); memcpy(bsse, x->bsse, sizeof(bsse)); @@ -3120,7 +3232,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, x->skip_encode = 0; ctx->skip = 0; xd->mi[0]->ref_frame[0] = INTRA_FRAME; - xd->mi[0]->ref_frame[1] = NONE; + xd->mi[0]->ref_frame[1] = NO_REF_FRAME; // Initialize interp_filter here so we do not have to check for inter block // modes in get_pred_context_switchable_interp() xd->mi[0]->interp_filter = SWITCHABLE_FILTERS; @@ -3344,6 +3456,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES]; int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES]; + int single_mode_rate[MAX_REF_FRAMES][INTER_MODES]; int64_t best_rd = best_rd_so_far; int64_t best_pred_diff[REFERENCE_MODES]; int64_t best_pred_rd[REFERENCE_MODES]; @@ -3493,7 +3606,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL; } - if (bsize > sf->max_intra_bsize) { + if (bsize > sf->max_intra_bsize && cpi->ref_frame_flags != 0) { ref_frame_skip_mask[0] |= (1 << INTRA_FRAME); ref_frame_skip_mask[1] |= (1 << INTRA_FRAME); } @@ -3542,6 +3655,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; vp9_zero(x->sum_y_eobs); + comp_pred = second_ref_frame > INTRA_FRAME; + if (!comp_pred && ref_frame != INTRA_FRAME && + sf->prune_single_mode_based_on_mv_diff_mode_rate) + single_mode_rate[ref_frame][INTER_OFFSET(this_mode)] = INT_MAX; if (is_rect_partition) { if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue; @@ -3560,7 +3677,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK; break; case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK; break; - case NONE: + case NO_REF_FRAME: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break; } } @@ -3593,7 +3710,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, MODE_INFO *ref_mi; int const_motion = 1; int skip_ref_frame = !cb_partition_search_ctrl; - MV_REFERENCE_FRAME rf = NONE; + MV_REFERENCE_FRAME rf = NO_REF_FRAME; int_mv ref_mv; ref_mv.as_int = INVALID_MV; @@ -3610,7 +3727,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if ((mi_col - 1) >= tile_info->mi_col_start) { if (ref_mv.as_int == INVALID_MV) ref_mv = xd->mi[-1]->mv[0]; - if (rf == NONE) rf = xd->mi[-1]->ref_frame[0]; + if (rf == NO_REF_FRAME) rf = xd->mi[-1]->ref_frame[0]; for (i = 0; i < mi_height; ++i) { ref_mi = xd->mi[i * xd->mi_stride - 1]; const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) && @@ -3627,7 +3744,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (this_mode == NEARMV || this_mode == ZEROMV) continue; } - comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; @@ -3707,19 +3823,30 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (ref_frame == 
INTRA_FRAME) { TX_SIZE uv_tx; struct macroblockd_plane *const pd = &xd->plane[1]; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, intra_mode_search_time); +#endif memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize, best_rd, recon); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, intra_mode_search_time); +#endif if (rate_y == INT_MAX) continue; uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x] [pd->subsampling_y]; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, intra_mode_search_time); +#endif if (rate_uv_intra[uv_tx] == INT_MAX) { choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]); } - +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, intra_mode_search_time); +#endif rate_uv = rate_uv_tokenonly[uv_tx]; distortion_uv = dist_uv[uv_tx]; skippable = skippable && skip_uv[uv_tx]; @@ -3730,11 +3857,18 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_inter_mode_time); +#endif this_rd = handle_inter_mode( cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv, recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv, - single_inter_filter, single_skippable, &total_sse, best_rd, - &mask_filter, filter_cache); + single_inter_filter, single_skippable, + &single_mode_rate[ref_frame][0], &total_sse, best_rd, &mask_filter, + filter_cache, best_mode_index); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_inter_mode_time); +#endif if (this_rd == INT64_MAX) continue; compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred); @@ -3970,13 +4104,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } if (best_mode_index < 0 || best_rd >= best_rd_so_far) { -// If adaptive interp filter is enabled, then the current leaf node of 8x8 -// data is needed for sub8x8. Hence preserve the context. -#if CONFIG_CONSISTENT_RECODE + // If adaptive interp filter is enabled, then the current leaf node of 8x8 + // data is needed for sub8x8. Hence preserve the context. 
if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; -#else - if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; -#endif rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; @@ -4091,7 +4221,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data, mi->mode = ZEROMV; mi->uv_mode = DC_PRED; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->mv[0].as_int = 0; x->skip = 1; @@ -4236,7 +4366,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int rate2 = 0, rate_y = 0, rate_uv = 0; int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; int skippable = 0; - int i; int this_skip2 = 0; int64_t total_sse = INT_MAX; int early_term = 0; @@ -4274,7 +4403,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, case ALTREF_FRAME: ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME); break; - case NONE: + case NO_REF_FRAME: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break; } } @@ -4397,7 +4526,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, : NULL; if (scaled_ref_frame[ref]) { - int i; // Swap out the reference frame for a version that's been scaled to // match the resolution of the current frame, allowing the existing // motion search code to be used without additional modifications. @@ -4534,14 +4662,13 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, if (tmp_best_rdu > 0) { // If even the 'Y' rd value of split is higher than best so far - // then dont bother looking at UV + // then don't bother looking at UV vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8); memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm)); if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, &uv_sse, BLOCK_8X8, tmp_best_rdu)) { for (ref = 0; ref < 2; ++ref) { if (scaled_ref_frame[ref]) { - int i; for (i = 0; i < MAX_MB_PLANE; ++i) xd->plane[i].pre[ref] = backup_yv12[ref][i]; } @@ -4558,7 +4685,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, for (ref = 0; ref < 2; ++ref) { if (scaled_ref_frame[ref]) { // Restore the prediction frame pointers to their unscaled versions. - int i; for (i = 0; i < MAX_MB_PLANE; ++i) xd->plane[i].pre[ref] = backup_yv12[ref][i]; } @@ -4764,7 +4890,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, mi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int; } // If the second reference does not exist, set the corresponding mv to zero. - if (mi->ref_frame[1] == NONE) { + if (mi->ref_frame[1] == NO_REF_FRAME) { mi->mv[1].as_int = 0; for (i = 0; i < 4; ++i) { mi->bmi[i].as_mv[1].as_int = 0; diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c index 7486dee25..ca55ec988 100644 --- a/vp9/encoder/vp9_resize.c +++ b/vp9/encoder/vp9_resize.c @@ -360,6 +360,12 @@ static int get_down2_steps(int in_length, int out_length) { while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) { ++steps; in_length = proj_in_length; + if (in_length == 1) { + // Special case: we break because any further calls to get_down2_length() + // will be with length == 1, which returns 1, resulting in an infinite + // loop. 
+ break; + } } return steps; } diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 0431d8a45..56fb5f94f 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -16,8 +16,11 @@ #include "vpx_dsp/vpx_dsp_common.h" // Mesh search patterns for various speed settings -static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = { - { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } +// Define 2 mesh density levels for FC_GRAPHICS_ANIMATION content type and non +// FC_GRAPHICS_ANIMATION content type. +static MESH_PATTERN best_quality_mesh_pattern[2][MAX_MESH_STEP] = { + { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } }, + { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, }; #if !CONFIG_REALTIME_ONLY @@ -39,7 +42,7 @@ static int frame_is_boosted(const VP9_COMP *cpi) { // Sets a partition size down to which the auto partition code will always // search (can go lower), based on the image dimensions. The logic here // is that the extent to which ringing artefacts are offensive, depends -// partly on the screen area that over which they propogate. Propogation is +// partly on the screen area over which they propagate. Propagation is // limited by transform block size but the screen area taken up by a given block // size will be larger for a small image format stretched to full screen. static BLOCK_SIZE set_partition_min_limit(VP9_COMMON *const cm) { @@ -67,6 +70,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, const int is_720p_or_larger = min_frame_size >= 720; const int is_1080p_or_larger = min_frame_size >= 1080; const int is_2160p_or_larger = min_frame_size >= 2160; + const int boosted = frame_is_boosted(cpi); // speed 0 features sf->partition_search_breakout_thr.dist = (1 << 20); @@ -78,9 +82,13 @@ // Currently, the machine-learning based partition search early termination // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. sf->rd_ml_partition.search_early_termination = 1; + sf->recode_tolerance_high = 45; } else { sf->use_square_only_thresh_high = BLOCK_32X32; } + if (is_720p_or_larger) { + sf->alt_ref_search_fp = 1; + } if (!is_1080p_or_larger) { sf->rd_ml_partition.search_breakout = 1; @@ -95,6 +103,13 @@ } } + if (!is_720p_or_larger) { + if (is_480p_or_larger) + sf->prune_single_mode_based_on_mv_diff_mode_rate = boosted ? 
0 : 1; + else + sf->prune_single_mode_based_on_mv_diff_mode_rate = 1; + } + if (speed >= 1) { sf->rd_ml_partition.search_early_termination = 0; sf->rd_ml_partition.search_breakout = 1; @@ -152,7 +167,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; sf->alt_ref_search_fp = 1; - sf->cb_pred_filter_search = 1; + sf->cb_pred_filter_search = 2; sf->adaptive_interp_filter_search = 1; sf->disable_split_mask = DISABLE_ALL_SPLIT; } @@ -209,15 +224,32 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, const int boosted = frame_is_boosted(cpi); int i; - sf->tx_size_search_breakout = 1; + sf->adaptive_interp_filter_search = 1; + sf->adaptive_pred_interp_filter = 1; sf->adaptive_rd_thresh = 1; sf->adaptive_rd_thresh_row_mt = 0; sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; - sf->use_square_partition_only = !boosted; + sf->mv.auto_mv_step_size = 1; + sf->mv.use_downsampled_sad = 1; sf->prune_ref_frame_for_rect_partitions = 1; - sf->rd_ml_partition.var_pruning = 1; + sf->temporal_filter_search_method = NSTEP; + sf->tx_size_search_breakout = 1; + sf->use_square_partition_only = !boosted; + sf->early_term_interp_search_plane_rd = 1; + sf->cb_pred_filter_search = 1; + sf->trellis_opt_tx_rd.method = sf->optimize_coefficients + ? ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE + : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = boosted ? 4.0 : 3.0; + + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->comp_inter_joint_search_iter_level = 1; + + // Reference masking is not supported in dynamic scaling mode. + sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC; + sf->rd_ml_partition.var_pruning = 1; sf->rd_ml_partition.prune_rect_thresh[0] = -1; sf->rd_ml_partition.prune_rect_thresh[1] = 350; sf->rd_ml_partition.prune_rect_thresh[2] = 325; @@ -238,7 +270,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, } if (speed >= 1) { - sf->temporal_filter_search_method = NSTEP; sf->rd_ml_partition.var_pruning = !boosted; sf->rd_ml_partition.prune_rect_thresh[1] = 225; sf->rd_ml_partition.prune_rect_thresh[2] = 225; @@ -258,19 +289,18 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->allow_txfm_domain_distortion = 1; sf->tx_domain_thresh = tx_dom_thresholds[(speed < 6) ? speed : 5]; - sf->allow_quant_coeff_opt = sf->optimize_coefficients; - sf->quant_opt_thresh = qopt_thresholds[(speed < 6) ? speed : 5]; + sf->trellis_opt_tx_rd.method = sf->optimize_coefficients + ? ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR + : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = qopt_thresholds[(speed < 6) ? speed : 5]; sf->less_rectangular_check = 1; sf->use_rd_breakout = 1; sf->adaptive_motion_search = 1; - sf->mv.auto_mv_step_size = 1; sf->adaptive_rd_thresh = 2; sf->mv.subpel_search_level = 1; if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10; - sf->adaptive_pred_interp_filter = 1; sf->allow_acl = 0; - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; if (cpi->oxcf.content != VP9E_CONTENT_FILM) { sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; @@ -296,18 +326,14 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL; - // Reference masking is not supported in dynamic scaling mode. - sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC ? 
1 : 0; - sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; sf->disable_filter_search_var_thresh = 100; - sf->comp_inter_joint_search_thresh = BLOCK_SIZES; + sf->comp_inter_joint_search_iter_level = 2; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; - sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 45; sf->enhanced_full_pixel_motion_search = 0; sf->prune_ref_frame_for_rect_partitions = 0; @@ -337,14 +363,13 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 1; sf->cb_partition_search = !boosted; - sf->cb_pred_filter_search = 1; + sf->cb_pred_filter_search = 2; sf->alt_ref_search_fp = 1; sf->recode_loop = ALLOW_RECODE_KFMAXBW; sf->adaptive_rd_thresh = 3; sf->mode_skip_start = 6; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; - sf->adaptive_interp_filter_search = 1; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { for (i = 0; i < MAX_MESH_STEP; ++i) { @@ -373,7 +398,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, } if (speed >= 5) { - int i; sf->optimize_coefficients = 0; sf->mv.search_method = HEX; sf->disable_filter_search_var_thresh = 500; @@ -461,8 +485,8 @@ static void set_rt_speed_feature_framesize_independent( if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; sf->tx_domain_thresh = 0.0; - sf->allow_quant_coeff_opt = 0; - sf->quant_opt_thresh = 0.0; + sf->trellis_opt_tx_rd.method = DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = 0.0; sf->use_square_partition_only = !frame_is_intra_only(cm); sf->less_rectangular_check = 1; sf->tx_size_search_method = @@ -507,7 +531,7 @@ static void set_rt_speed_feature_framesize_independent( } sf->disable_filter_search_var_thresh = 50; - sf->comp_inter_joint_search_thresh = BLOCK_SIZES; + sf->comp_inter_joint_search_iter_level = 2; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; sf->adjust_partitioning_from_last_frame = 1; @@ -631,7 +655,7 @@ static void set_rt_speed_feature_framesize_independent( sf->use_altref_onepass = 1; sf->use_compound_nonrd_pickmode = 1; } - if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 1; + if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 2; if (!cpi->external_resize) sf->use_source_sad = 1; } @@ -652,7 +676,7 @@ static void set_rt_speed_feature_framesize_independent( if (cpi->content_state_sb_fd == NULL && (!cpi->use_svc || svc->spatial_layer_id == svc->number_spatial_layers - 1)) { - CHECK_MEM_ERROR(cm, cpi->content_state_sb_fd, + CHECK_MEM_ERROR(&cm->error, cpi->content_state_sb_fd, (uint8_t *)vpx_calloc( (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t))); @@ -721,7 +745,7 @@ static void set_rt_speed_feature_framesize_independent( if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer && svc->temporal_layer_id > 0) cpi->ref_frame_flags &= (~VP9_GOLD_FLAG); - if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 1; + if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 2; } if (speed >= 8) { @@ -765,7 +789,7 @@ static void set_rt_speed_feature_framesize_independent( } sf->limit_newmv_early_exit = 0; sf->use_simple_block_yrd = 1; - if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 1; + if (cm->width * cm->height > 352 * 288) 
sf->cb_pred_filter_search = 2; } if (speed >= 9) { @@ -775,7 +799,7 @@ static void set_rt_speed_feature_framesize_independent( for (i = 0; i < BLOCK_SIZES; ++i) sf->intra_y_mode_bsize_mask[i] = INTRA_DC; } - sf->cb_pred_filter_search = 1; + sf->cb_pred_filter_search = 2; sf->mv.enable_adaptive_subpel_force_stop = 1; sf->mv.adapt_subpel_force_stop.mv_thresh = 1; sf->mv.adapt_subpel_force_stop.force_stop_below = QUARTER_PEL; @@ -808,13 +832,13 @@ static void set_rt_speed_feature_framesize_independent( } if (cpi->count_arf_frame_usage == NULL) { CHECK_MEM_ERROR( - cm, cpi->count_arf_frame_usage, + &cm->error, cpi->count_arf_frame_usage, (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->count_arf_frame_usage))); } if (cpi->count_lastgolden_frame_usage == NULL) CHECK_MEM_ERROR( - cm, cpi->count_lastgolden_frame_usage, + &cm->error, cpi->count_lastgolden_frame_usage, (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->count_lastgolden_frame_usage))); } @@ -835,6 +859,11 @@ static void set_rt_speed_feature_framesize_independent( // off for now. if (speed <= 3 && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) cpi->oxcf.aq_mode = 0; + // For all speeds for rt mode: if the deadline mode changed (was good/best + // quality on previous frame and now is realtime) set nonrd_keyframe to 1 to + // avoid entering rd pickmode. This causes issues, such as: b/310663186. + if (cpi->oxcf.mode != cpi->deadline_mode_previous_frame) + sf->nonrd_keyframe = 1; } void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi, int speed) { @@ -904,14 +933,17 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->coeff_prob_appx_step = 1; sf->mv.auto_mv_step_size = 0; sf->mv.fullpel_search_step_param = 6; - sf->comp_inter_joint_search_thresh = BLOCK_4X4; + sf->mv.use_downsampled_sad = 0; + sf->comp_inter_joint_search_iter_level = 0; sf->tx_size_search_method = USE_FULL_RD; sf->use_lp32x32fdct = 0; sf->adaptive_motion_search = 0; sf->enhanced_full_pixel_motion_search = 1; sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; + sf->prune_single_mode_based_on_mv_diff_mode_rate = 0; sf->cb_pred_filter_search = 0; + sf->early_term_interp_search_plane_rd = 0; sf->cb_partition_search = 0; sf->motion_field_mode_search = 0; sf->alt_ref_search_fp = 0; @@ -936,8 +968,9 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->adaptive_interp_filter_search = 0; sf->allow_txfm_domain_distortion = 0; sf->tx_domain_thresh = 99.0; - sf->allow_quant_coeff_opt = sf->optimize_coefficients; - sf->quant_opt_thresh = 99.0; + sf->trellis_opt_tx_rd.method = + sf->optimize_coefficients ? ENABLE_TRELLIS_OPT : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = 99.0; sf->allow_acl = 1; sf->enable_tpl_model = oxcf->enable_tpl_model; sf->prune_ref_frame_for_rect_partitions = 0; @@ -991,10 +1024,14 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->exhaustive_searches_thresh = (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20) : INT_MAX; - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + { + const int mesh_density_level = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? 
0 : 1; for (i = 0; i < MAX_MESH_STEP; ++i) { - sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range; - sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval; + sf->mesh_patterns[i].range = + best_quality_mesh_pattern[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + best_quality_mesh_pattern[mesh_density_level][i].interval; } } diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index c2ae970b7..941de639a 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -210,6 +210,10 @@ typedef struct MV_SPEED_FEATURES { // This variable sets the step_param used in full pel motion search. int fullpel_search_step_param; + + // Whether to downsample the rows in sad calculation during motion search. + // This is only active when there are at least 8 rows. + int use_downsampled_sad; } MV_SPEED_FEATURES; typedef struct PARTITION_SEARCH_BREAKOUT_THR { @@ -246,6 +250,24 @@ typedef enum { USE_8_TAPS_SHARP, } SUBPEL_SEARCH_TYPE; +typedef enum { + // Disable trellis coefficient optimization + DISABLE_TRELLIS_OPT, + // Enable trellis coefficient optimization + ENABLE_TRELLIS_OPT, + // Enable trellis coefficient optimization based on source variance of the + // prediction block during transform RD + ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR, + // Enable trellis coefficient optimization based on residual mse of the + // transform block during transform RD + ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE, +} ENABLE_TRELLIS_OPT_METHOD; + +typedef struct TRELLIS_OPT_CONTROL { + ENABLE_TRELLIS_OPT_METHOD method; + double thresh; +} TRELLIS_OPT_CONTROL; + typedef struct SPEED_FEATURES { MV_SPEED_FEATURES mv; @@ -264,11 +286,20 @@ typedef struct SPEED_FEATURES { // adds overhead. int static_segmentation; - // If 1 we iterate finding a best reference for 2 ref frames together - via - // a log search that iterates 4 times (check around mv for last for best - // error of combined predictor then check around mv for alt). If 0 we - // we just use the best motion vector found for each frame by itself. - BLOCK_SIZE comp_inter_joint_search_thresh; + // The best compound predictor is found using an iterative log search process + // that searches for best ref0 mv using error of combined predictor and then + // searches for best ref1 mv. This sf determines the number of iterations of + // this process based on block size. The sf becomes more aggressive from level + // 0 to 2. The following table indicates the number of iterations w.r.t bsize: + // ----------------------------------------------- + // |sf (level)|bsize < 8X8| [8X8, 16X16] | > 16X16 | + // | 0 | 4 | 4 | 4 | + // | 1 | 0 | 2 | 4 | + // | 2 | 0 | 0 | 0 | + // ----------------------------------------------- + // Here, 0 iterations indicate using the best single motion vector selected + // for each ref frame without any iterative refinement. + int comp_inter_joint_search_iter_level; // This variable is used to cap the maximum number of times we skip testing a // mode to be evaluated. A high value means we will be faster. @@ -292,8 +323,8 @@ typedef struct SPEED_FEATURES { int coeff_prob_appx_step; // Enable uniform quantizer followed by trellis coefficient optimization - int allow_quant_coeff_opt; - double quant_opt_thresh; + // during transform RD + TRELLIS_OPT_CONTROL trellis_opt_tx_rd; // Enable asymptotic closed-loop encoding decision for key frame and // alternate reference frames. 
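
The iteration table above fully specifies the new helper's behavior, so a sketch is easy to give. A minimal implementation of get_joint_search_iters() consistent with that table, assuming the three bsize buckets map directly onto VP9's BLOCK_SIZE enum (the actual helper in vp9_rdopt.c may differ in detail):

static int get_joint_search_iters(int sf_level, BLOCK_SIZE bsize) {
  // Level 0: always run the full 4 refinement iterations.
  if (sf_level == 0) return 4;
  // Level 2: skip iterative refinement entirely.
  if (sf_level >= 2) return 0;
  // Level 1: spend iterations only on blocks of 8x8 and above.
  if (bsize < BLOCK_8X8) return 0;
  if (bsize <= BLOCK_16X16) return 2;
  return 4;
}

Returning 0 makes the callers shown earlier fall through to the vp9_mv_bit_cost() path, i.e. the best single-prediction motion vector of each reference frame is used without joint refinement.
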
@@ -399,9 +430,21 @@ typedef struct SPEED_FEATURES { // Adaptive prediction mode search int adaptive_mode_search; - // Chessboard pattern prediction filter type search + // Prune NEAREST and ZEROMV single reference modes based on motion vector + // difference and mode rate + int prune_single_mode_based_on_mv_diff_mode_rate; + + // Chessboard pattern prediction for interp filter. Aggressiveness increases + // with levels. + // 0: disable + // 1: cb pattern in eval when filter is not switchable + // 2: cb pattern prediction for filter search int cb_pred_filter_search; + // This variable enables an early termination of interpolation filter eval + // based on the current rd cost after processing each plane + int early_term_interp_search_plane_rd; + int cb_partition_search; int motion_field_mode_search; @@ -600,7 +643,7 @@ typedef struct SPEED_FEATURES { // Use machine learning based partition search. int nonrd_use_ml_partition; - // Multiplier for base thresold for variance partitioning. + // Multiplier for base threshold for variance partitioning. int variance_part_thresh_mult; // Force subpel motion filter to always use SMOOTH_FILTER. diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 7e9435fb5..fff6d25de 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -107,7 +107,6 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers); LAYER_CONTEXT *const lc = &svc->layer_context[layer]; RATE_CONTROL *const lrc = &lc->rc; - int i; lc->current_video_frame_in_layer = 0; lc->layer_size = 0; lc->frames_from_key_frame = 0; @@ -164,17 +163,17 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { lc->actual_num_seg1_blocks = 0; lc->actual_num_seg2_blocks = 0; lc->counter_encode_maxq_scene_change = 0; - CHECK_MEM_ERROR(cm, lc->map, + CHECK_MEM_ERROR(&cm->error, lc->map, vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map))); memset(lc->map, 0, mi_rows * mi_cols); last_coded_q_map_size = mi_rows * mi_cols * sizeof(*lc->last_coded_q_map); - CHECK_MEM_ERROR(cm, lc->last_coded_q_map, + CHECK_MEM_ERROR(&cm->error, lc->last_coded_q_map, vpx_malloc(last_coded_q_map_size)); assert(MAXQ <= 255); memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size); consec_zero_mv_size = mi_rows * mi_cols * sizeof(*lc->consec_zero_mv); - CHECK_MEM_ERROR(cm, lc->consec_zero_mv, + CHECK_MEM_ERROR(&cm->error, lc->consec_zero_mv, vpx_malloc(consec_zero_mv_size)); memset(lc->consec_zero_mv, 0, consec_zero_mv_size); } @@ -220,18 +219,21 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, RATE_CONTROL *const lrc = &lc->rc; lc->spatial_layer_target_bandwidth = spatial_layer_target; - bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + if (target_bandwidth != 0) { + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + } lrc->starting_buffer_level = - (int64_t)(rc->starting_buffer_level * bitrate_alloc); + (int64_t)(rc->starting_buffer_level * bitrate_alloc + 0.5); lrc->optimal_buffer_level = - (int64_t)(rc->optimal_buffer_level * bitrate_alloc); + (int64_t)(rc->optimal_buffer_level * bitrate_alloc + 0.5); lrc->maximum_buffer_size = - (int64_t)(rc->maximum_buffer_size * bitrate_alloc); + (int64_t)(rc->maximum_buffer_size * bitrate_alloc + 0.5); lrc->bits_off_target = VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size); lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl]; - 
lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; lrc->worst_quality = rc->worst_quality; lrc->best_quality = rc->best_quality; @@ -252,7 +254,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, lc->target_bandwidth = oxcf->layer_target_bitrate[layer]; - bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + if (target_bandwidth != 0) { + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + } // Update buffer-related quantities. lrc->starting_buffer_level = (int64_t)(rc->starting_buffer_level * bitrate_alloc); @@ -269,7 +273,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, } else { lc->framerate = cpi->framerate; } - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; // Update qp-related quantities. lrc->worst_quality = rc->worst_quality; @@ -311,7 +316,8 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { const int tl = svc->temporal_layer_id; lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl]; - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; // Update the average layer frame size (non-cumulative per-frame-bw). if (tl == 0) { @@ -333,7 +339,8 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) { RATE_CONTROL *const lrc = &lc->rc; lc->framerate = framerate; - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->min_frame_bandwidth = (int)(lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100); lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth * @@ -389,6 +396,8 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { lc->twopass = cpi->twopass; lc->target_bandwidth = (int)oxcf->target_bandwidth; lc->alt_ref_source = cpi->alt_ref_source; + lc->frame_qp = cpi->common.base_qindex; + lc->MBs = cpi->common.MBs; // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, // for the base temporal layer. @@ -408,6 +417,9 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks; lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks; lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change; + lc->qindex_delta[0] = cr->qindex_delta[0]; + lc->qindex_delta[1] = cr->qindex_delta[1]; + lc->qindex_delta[2] = cr->qindex_delta[2]; } } @@ -790,9 +802,9 @@ int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) { for (sl = svc->number_spatial_layers - 1; sl >= svc->first_spatial_layer_to_encode; sl--) { int layer = sl * svc->number_temporal_layers + svc->temporal_layer_id; - LAYER_CONTEXT *const lc = &svc->layer_context[layer]; - cpi->rc = lc->rc; - cpi->oxcf.target_bandwidth = lc->target_bandwidth; + LAYER_CONTEXT *const sl_lc = &svc->layer_context[layer]; + cpi->rc = sl_lc->rc; + cpi->oxcf.target_bandwidth = sl_lc->target_bandwidth; if (vp9_test_drop(cpi)) { int sl2; // Set flag to force drop in encoding for this mode. 
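
The avg_frame_bandwidth assignments above all route the double-valued quotient through VPXMIN() before narrowing to int, since converting a double larger than INT_MAX to int is undefined behavior in C. A standalone sketch of the pattern, with a hypothetical helper name and example values (VPXMIN itself is the min macro from vpx_dsp/vpx_dsp_common.h):

#include <limits.h>

#define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))

static int layer_avg_frame_bandwidth(int target_bandwidth, double framerate) {
  // A small framerate, e.g. 1e-4 against a 2 Mbps target, yields 2e10 bits
  // per frame; clamping to INT_MAX first keeps the cast to int well defined.
  const double bits_per_frame = target_bandwidth / framerate;
  return (int)VPXMIN(bits_per_frame, INT_MAX);
}
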
@@ -1041,17 +1053,17 @@ void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) { int sl, tl; for (sl = 0; sl < svc->number_spatial_layers; ++sl) { // Check for reset based on avg_frame_bandwidth for spatial layer sl. - int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1, - svc->number_temporal_layers); - LAYER_CONTEXT *lc = &svc->layer_context[layer]; + const int spatial_layer_idx = LAYER_IDS_TO_IDX( + sl, svc->number_temporal_layers - 1, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[spatial_layer_idx]; RATE_CONTROL *lrc = &lc->rc; if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) || lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) { // Reset for all temporal layers with spatial layer sl. for (tl = 0; tl < svc->number_temporal_layers; ++tl) { - int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); - LAYER_CONTEXT *lc = &svc->layer_context[layer]; - RATE_CONTROL *lrc = &lc->rc; + int temporal_layer_idx = + LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + lrc = &svc->layer_context[temporal_layer_idx].rc; lrc->rc_1_frame = 0; lrc->rc_2_frame = 0; lrc->bits_off_target = lrc->optimal_buffer_level; @@ -1137,7 +1149,7 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { void vp9_svc_assert_constraints_pattern(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; // For fixed/non-flexible mode, the following constraints are expected, - // when inter-layer prediciton is on (default). + // when inter-layer prediction is on (default). if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && svc->disable_inter_layer_pred == INTER_LAYER_PRED_ON && svc->framedrop_mode != LAYER_DROP) { @@ -1338,3 +1350,27 @@ void vp9_svc_adjust_avg_frame_qindex(VP9_COMP *const cpi) { } } } + +// SVC: skip encoding of enhancement layer if the layer target bandwidth = 0. +// No need to set svc.skip_enhancement_layer if whole superframe will be +// dropped. 
+int vp9_svc_check_skip_enhancement_layer(VP9_COMP *const cpi) { + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && + cpi->oxcf.target_bandwidth == 0 && + !(cpi->svc.framedrop_mode != LAYER_DROP && + (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP || + cpi->svc + .force_drop_constrained_from_above[cpi->svc.number_spatial_layers - + 1]) && + cpi->svc.drop_spatial_layer[0])) { + cpi->svc.skip_enhancement_layer = 1; + vp9_rc_postencode_update_drop_frame(cpi); + cpi->ext_refresh_frame_flags_pending = 0; + cpi->last_frame_dropped = 1; + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; + cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; + vp9_inc_frame_in_layer(cpi); + return 1; + } + return 0; +} diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index c7328cf57..388a02789 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -70,8 +70,11 @@ typedef struct { int actual_num_seg1_blocks; int actual_num_seg2_blocks; int counter_encode_maxq_scene_change; + int qindex_delta[3]; uint8_t speed; int loopfilter_ctrl; + int frame_qp; + int MBs; } LAYER_CONTEXT; typedef struct SVC { @@ -278,6 +281,8 @@ void vp9_svc_update_ref_frame(struct VP9_COMP *const cpi); void vp9_svc_adjust_frame_rate(struct VP9_COMP *const cpi); void vp9_svc_adjust_avg_frame_qindex(struct VP9_COMP *const cpi); + +int vp9_svc_check_skip_enhancement_layer(struct VP9_COMP *const cpi); #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 8af30c42a..986553a4a 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -450,8 +450,6 @@ void vp9_highbd_apply_temporal_filter_c( // Apply the filter to luma for (row = 0; row < (int)block_height; row++) { for (col = 0; col < (int)block_width; col++) { - const int uv_row = row >> ss_y; - const int uv_col = col >> ss_x; const int filter_weight = get_filter_weight( row, col, block_height, block_width, blk_fw, use_32x32); @@ -476,6 +474,8 @@ void vp9_highbd_apply_temporal_filter_c( // Sum the corresponding uv pixels to the current y modifier // Note we are rounding down instead of rounding to the nearest pixel. + uv_row = row >> ss_y; + uv_col = col >> ss_x; y_mod += u_diff_sse[uv_row * uv_diff_stride + uv_col]; y_mod += v_diff_sse[uv_row * uv_diff_stride + uv_col]; diff --git a/vp9/encoder/x86/temporal_filter_constants.h b/vp9/encoder/vp9_temporal_filter_constants.h index 7dcedda19..8776dfc06 100644 --- a/vp9/encoder/x86/temporal_filter_constants.h +++ b/vp9/encoder/vp9_temporal_filter_constants.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ -#define VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ +#ifndef VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ +#define VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ #include "./vpx_config.h" // Division using multiplication and shifting. 
The C implementation does: @@ -407,4 +407,4 @@ static const uint32_t #define DIST_STRIDE ((BW) + 2) -#endif // VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ +#endif // VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 814d769be..6c6c04493 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -364,7 +364,7 @@ static void tokenize_b(int plane, int block, int row, int col, const PLANE_TYPE type = get_plane_type(plane); const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); const int16_t *scan, *nb; - const scan_order *so; + const ScanOrder *so; const int ref = is_inter_block(mi); unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] = td->rd_counts.coef_counts[tx_size][type][ref]; diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c new file mode 100644 index 000000000..b8910370e --- /dev/null +++ b/vp9/encoder/vp9_tpl_model.c @@ -0,0 +1,1541 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <math.h> + +#include "./vpx_dsp_rtcd.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/common/vp9_mvref_common.h" +#endif +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_tpl_model.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vpx_codec.h" + +static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, + const GF_GROUP *gf_group, int *tpl_group_frames) { + VP9_COMMON *cm = &cpi->common; + int frame_idx = 0; + int i; + int gld_index = -1; + int alt_index = -1; + int lst_index = -1; + int arf_index_stack[MAX_ARF_LAYERS]; + int arf_stack_size = 0; + int extend_frame_count = 0; + int pframe_qindex = cpi->tpl_stats[2].base_qindex; + int frame_gop_offset = 0; + + RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; + int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS]; + + memset(recon_frame_index, -1, sizeof(recon_frame_index)); + stack_init(arf_index_stack, MAX_ARF_LAYERS); + + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (frame_bufs[i].ref_count == 0) { + alloc_frame_mvs(cm, i); + if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + + recon_frame_index[frame_idx] = i; + ++frame_idx; + + if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break; + } + } + + for (i = 0; i < REFS_PER_FRAME + 1; ++i) { + assert(recon_frame_index[i] >= 0); + cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; + } + + *tpl_group_frames = 0; + + // Initialize Golden reference frame. 
+ gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1; + gf_picture[0].update_type = gf_group->update_type[0]; + gld_index = 0; + ++*tpl_group_frames; + + // Initialize base layer ARF frame + gf_picture[1].frame = cpi->Source; + gf_picture[1].ref_frame[0] = gld_index; + gf_picture[1].ref_frame[1] = lst_index; + gf_picture[1].ref_frame[2] = alt_index; + gf_picture[1].update_type = gf_group->update_type[1]; + alt_index = 1; + ++*tpl_group_frames; + + // Initialize P frames + for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + struct lookahead_entry *buf; + frame_gop_offset = gf_group->frame_gop_index[frame_idx]; + buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx]; + + switch (gf_group->update_type[frame_idx]) { + case ARF_UPDATE: + stack_push(arf_index_stack, alt_index, arf_stack_size); + ++arf_stack_size; + alt_index = frame_idx; + break; + case LF_UPDATE: lst_index = frame_idx; break; + case OVERLAY_UPDATE: + gld_index = frame_idx; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + case USE_BUF_FRAME: + lst_index = alt_index; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + default: break; + } + + ++*tpl_group_frames; + + // The length of group of pictures is baseline_gf_interval, plus the + // beginning golden frame from last GOP, plus the last overlay frame in + // the same GOP. + if (frame_idx == gf_group->gf_group_size) break; + } + + alt_index = -1; + ++frame_idx; + ++frame_gop_offset; + + // Extend two frames outside the current gf group. 
+ for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { + struct lookahead_entry *buf = + vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = LF_UPDATE; + lst_index = frame_idx; + ++*tpl_group_frames; + ++extend_frame_count; + ++frame_gop_offset; + } + + return extend_frame_count; +} + +static void init_tpl_stats(VP9_COMP *cpi) { + int frame_idx; + for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + memset(tpl_frame->tpl_stats_ptr, 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame->tpl_stats_ptr)); + tpl_frame->is_valid = 0; + } +} + +static void free_tpl_frame_stats_list(VpxTplGopStats *tpl_gop_stats) { + int frame_idx; + for (frame_idx = 0; frame_idx < tpl_gop_stats->size; ++frame_idx) { + vpx_free(tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list); + } + vpx_free(tpl_gop_stats->frame_stats_list); +} + +static void init_tpl_stats_before_propagation( + struct vpx_internal_error_info *error_info, VpxTplGopStats *tpl_gop_stats, + TplDepFrame *tpl_stats, int tpl_gop_frames, int frame_width, + int frame_height) { + int frame_idx; + free_tpl_frame_stats_list(tpl_gop_stats); + CHECK_MEM_ERROR( + error_info, tpl_gop_stats->frame_stats_list, + vpx_calloc(tpl_gop_frames, sizeof(*tpl_gop_stats->frame_stats_list))); + tpl_gop_stats->size = tpl_gop_frames; + for (frame_idx = 0; frame_idx < tpl_gop_frames; ++frame_idx) { + const int mi_rows = tpl_stats[frame_idx].height; + const int mi_cols = tpl_stats[frame_idx].width; + CHECK_MEM_ERROR( + error_info, tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list, + vpx_calloc( + mi_rows * mi_cols, + sizeof( + *tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list))); + tpl_gop_stats->frame_stats_list[frame_idx].num_blocks = mi_rows * mi_cols; + tpl_gop_stats->frame_stats_list[frame_idx].frame_width = frame_width; + tpl_gop_stats->frame_stats_list[frame_idx].frame_height = frame_height; + } +} + +#if CONFIG_NON_GREEDY_MV +static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, + MotionField *motion_field, + int frame_idx, uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + BLOCK_SIZE bsize, int mi_row, + int mi_col, MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + int step_param; + uint32_t bestsme = UINT_MAX; + const MvLimits tmp_mv_limits = x->mv_limits; + // lambda is used to adjust the importance of motion vector consistency. + // TODO(angiebird): Figure out lambda's proper value. 
+ const int lambda = cpi->tpl_stats[frame_idx].lambda; + int_mv nb_full_mvs[NB_MVS_NUM]; + int nb_full_mv_num; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + nb_full_mv_num = + vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs); + vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param, + lambda, 1, nb_full_mvs, nb_full_mv_num, mv); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + return bestsme; +} + +static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + BLOCK_SIZE bsize, MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + + MV best_ref_mv1 = { 0, 0 }; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. + // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} + +#else // CONFIG_NON_GREEDY_MV +static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, + int stride, BLOCK_SIZE bsize, + MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const SEARCH_METHODS search_method = NSTEP; + int step_param; + int sadpb = x->sadperbit16; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, + search_method, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, mv, 0, 0); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. 
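Both search variants make the same two simplifications, presumably because the TPL pass only needs approximate distortions rather than an rd-exact search: the fractional step runs with the 2-tap (bilinear) filter, and the cost arrays are passed as NULL so mv signaling cost is ignored entirely.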
+ // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} +#endif + +static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, + int ref_pos_col, int block, BLOCK_SIZE bsize) { + int width = 0, height = 0; + int bw = 4 << b_width_log2_lookup[bsize]; + int bh = 4 << b_height_log2_lookup[bsize]; + + switch (block) { + case 0: + width = grid_pos_col + bw - ref_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 1: + width = ref_pos_col + bw - grid_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 2: + width = grid_pos_col + bw - ref_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + case 3: + width = ref_pos_col + bw - grid_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + default: assert(0); + } + + return width * height; +} + +static int round_floor(int ref_pos, int bsize_pix) { + int round; + if (ref_pos < 0) + round = -(1 + (-ref_pos - 1) / bsize_pix); + else + round = ref_pos / bsize_pix; + + return round; +} + +static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, + BLOCK_SIZE bsize, int stride) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx]; + const int64_t mc_flow = tpl_ptr->mc_flow; + const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost; + *tpl_ptr = *src_stats; + tpl_ptr->mc_flow = mc_flow; + tpl_ptr->mc_ref_cost = mc_ref_cost; + tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; + } + } +} + +static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats, + TplDepStats *tpl_stats, int mi_row, + int mi_col, BLOCK_SIZE bsize, + int stride, int64_t recon_error, + int64_t rate_cost, int ref_frame_idx) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + VpxTplBlockStats *tpl_block_stats_ptr = + &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx]; + tpl_block_stats_ptr->row = mi_row * 8; + tpl_block_stats_ptr->col = mi_col * 8; + tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; + tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; + tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; + tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row; + tpl_block_stats_ptr->mv_c = src_stats->mv.as_mv.col; + tpl_block_stats_ptr->ref_frame_index = ref_frame_idx; + } + } +} + +static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; + TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; + MV mv = tpl_stats->mv.as_mv; + int mv_row = mv.row >> 3; + int mv_col = mv.col >> 3; + + int 
ref_pos_row = mi_row * MI_SIZE + mv_row; + int ref_pos_col = mi_col * MI_SIZE + mv_col; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pix_num = bw * bh; + + // top-left on grid block location in pixel + int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; + int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; + int block; + + for (block = 0; block < 4; ++block) { + int grid_pos_row = grid_pos_row_base + bh * (block >> 1); + int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); + + if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && + grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { + int overlap_area = get_overlap_area( + grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); + int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; + int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; + + int64_t mc_flow = tpl_stats->mc_dep_cost - + (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) / + tpl_stats->intra_cost; + + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *des_stats = + &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + + (ref_mi_col + idx)]; + + des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; + des_stats->mc_ref_cost += + ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) / + pix_num; + assert(overlap_area >= 0); + } + } + } + } +} + +static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + int idx, idy; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = + &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; + tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, + BLOCK_8X8); + } + } +} + +static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + TX_SIZE tx_size, int64_t *recon_error, + int64_t *sse, uint16_t *eob) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; + int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; + const int shift = tx_size == TX_32X32 ? 0 : 2; + + // skip block condition should be handled before this is called. 
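Stepping back to tpl_model_update_b() and get_overlap_area() above, a worked example with illustrative numbers: BLOCK_8X8 (bw = bh = 8), mi_row = mi_col = 4, full-pel mv = (3, -5):

    /* ref_pos   = (4*8 + 3, 4*8 - 5) = (35, 27)
     * grid base = (round_floor(35, 8) * 8, round_floor(27, 8) * 8) = (32, 24)
     * block 0 at (32, 24): 5 * 5 = 25    block 1 at (32, 32): 3 * 5 = 15
     * block 2 at (40, 24): 5 * 3 = 15    block 3 at (40, 32): 3 * 3 =  9
     * The areas sum to bw * bh = 64, so mc_flow is apportioned exactly
     * in proportion to how much of each grid block the mv covers. */

round_floor() keeps the flooring correct when the mv points above or left of the frame, e.g. round_floor(-3, 8) = -1 rather than 0.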
+ assert(!x->skip_block); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, + pd->dequant, eob, scan_order); + } else { + vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + } +#else + vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); +#endif // CONFIG_VP9_HIGHBITDEPTH + + *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; + *recon_error = VPXMAX(*recon_error, 1); + + *sse = (*sse) >> shift; + *sse = VPXMAX(*sse, 1); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms. + switch (tx_size) { + case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + switch (tx_size) { + case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} + +static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, + int mi_col) { + x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.row_max = + (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND); + x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.col_max = + ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); +} + +static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { + const ScanOrder *const scan_order = &vp9_scan_orders[tx_size][DCT_DCT]; + int rate_cost = 1; + int idx; + assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); + for (idx = 0; idx < eob; ++idx) { + unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]); + rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0); + } + + return (rate_cost << VP9_PROB_COST_SHIFT); +} + +static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + struct scale_factors *sf, GF_PICTURE *gf_picture, + int frame_idx, TplDepFrame *tpl_frame, + int16_t *src_diff, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, + int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, + YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, + int64_t *recon_error, int64_t *rate_cost, + int64_t *sse, int *ref_frame_idx) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int pix_num = bw * bh; + int best_rf_idx = -1; + int_mv best_mv; + int64_t best_inter_cost = INT64_MAX; + int64_t inter_cost; + int rf_idx; + const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + + int64_t best_intra_cost = INT64_MAX; + int64_t intra_cost; + PREDICTION_MODE mode; + int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + MODE_INFO mi_above, mi_left; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + TplDepStats 
*tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8; + xd->above_mi = (mi_row > 0) ? &mi_above : NULL; + xd->left_mi = (mi_col > 0) ? &mi_left : NULL; + + // Intra prediction search + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + uint8_t *src, *dst; + int src_stride, dst_stride; + + src = xd->cur_buf->y_buffer + mb_y_offset; + src_stride = xd->cur_buf->y_stride; + + dst = &predictor[0]; + dst_stride = bw; + + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + + vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src, + src_stride, dst, dst_stride, 0, 0, 0); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride, xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); + } +#else + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; + } + + // Motion compensated prediction + best_mv.as_int = 0; + + set_mv_limits(cm, x, mi_row, mi_col); + + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int_mv mv; +#if CONFIG_NON_GREEDY_MV + MotionField *motion_field; +#endif + if (ref_frame[rf_idx] == NULL) continue; + +#if CONFIG_NON_GREEDY_MV + (void)td; + motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); +#else + motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, bsize, &mv.as_mv); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset), + ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd->bd); + vpx_highbd_subtract_block( + bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vp9_build_inter_predictor( + ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh, + 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); + } +#else + vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, + mi_col * 
MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); +#endif + + if (inter_cost < best_inter_cost) { + uint16_t eob = 0; + best_rf_idx = rf_idx; + best_inter_cost = inter_cost; + best_mv.as_int = mv.as_int; + // Since best_inter_cost is initialized as INT64_MAX, recon_error and + // rate_cost will be calculated with the best reference frame. + get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, + sse, &eob); + *rate_cost = rate_estimator(qcoeff, eob, tx_size); + } + } + best_intra_cost = VPXMAX(best_intra_cost, 1); + best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost); + tpl_stats->inter_cost = VPXMAX( + 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->intra_cost = VPXMAX( + 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; + tpl_stats->mv.as_int = best_mv.as_int; + *ref_frame_idx = best_rf_idx; +} + +#if CONFIG_NON_GREEDY_MV +static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture, + int frame_idx, int rf_idx, int mi_row, + int mi_col, struct buf_2d *src, + struct buf_2d *pre) { + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + YV12_BUFFER_CONFIG *ref_frame = NULL; + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + ref_frame = gf_picture[ref_frame_idx].frame; + src->buf = xd->cur_buf->y_buffer + mb_y_offset; + src->stride = xd->cur_buf->y_stride; + pre->buf = ref_frame->y_buffer + mb_y_offset; + pre->stride = ref_frame->y_stride; + assert(src->stride == pre->stride); + return 1; + } else { + printf("invalid ref_frame_idx"); + assert(ref_frame_idx != -1); + return 0; + } +} + +#define kMvPreCheckLines 5 +#define kMvPreCheckSize 15 + +#define MV_REF_POS_NUM 3 +POSITION mv_ref_pos[MV_REF_POS_NUM] = { + { -1, 0 }, + { 0, -1 }, + { -1, -1 }, +}; + +static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row, + int mi_col) { + return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col]; +} + +static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + int i; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int_mv nearest_mv, near_mv, invalid_mv; + nearest_mv.as_int = INVALID_MV; + near_mv.as_int = INVALID_MV; + invalid_mv.as_int = INVALID_MV; + for (i = 0; i < MV_REF_POS_NUM; ++i) { + int nb_row = mi_row + mv_ref_pos[i].row * mi_height; + int nb_col = mi_col + mv_ref_pos[i].col * mi_width; + assert(mv_ref_pos[i].row <= 0); + assert(mv_ref_pos[i].col <= 0); + if (nb_row >= 0 && nb_col >= 0) { + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + } else { + int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + if (mv.as_int == nearest_mv.as_int) { + continue; + } else { + near_mv = mv; + break; + } + } + } + } + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv.as_mv.row = 0; + nearest_mv.as_mv.col = 0; + } + if (near_mv.as_int == INVALID_MV) { + near_mv.as_mv.row = 0; + near_mv.as_mv.col = 0; + } + if (mv_mode == NEAREST_MV_MODE) { + return nearest_mv; + } + if (mv_mode == NEAR_MV_MODE) { + return near_mv; + } + assert(0); + 
return invalid_mv; +} + +static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi, + MotionField *motion_field, + TplDepFrame *tpl_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + int_mv mv; + switch (mv_mode) { + case ZERO_MV_MODE: + mv.as_mv.row = 0; + mv.as_mv.col = 0; + break; + case NEW_MV_MODE: + mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); + break; + case NEAREST_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + case NEAR_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + default: + mv.as_int = INVALID_MV; + assert(0); + break; + } + return mv; +} + +static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *mv) { + uint32_t sse; + struct buf_2d src; + struct buf_2d pre; + MV full_mv; + *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize, + mi_row, mi_col); + full_mv = get_full_mv(&mv->as_mv); + if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col, + &src, &pre)) { + // TODO(angiebird): Consider subpixel when computing the sse. + cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv), + pre.stride, &sse); + return (double)(sse << VP9_DIST_SCALE_LOG2); + } else { + assert(0); + return 0; + } +} + +static int get_mv_mode_cost(int mv_mode) { + // TODO(angiebird): The probabilities are roughly inferred from + // default_inter_mode_probs. Check if there is a better way to set the + // probabilities. + const int zero_mv_prob = 16; + const int new_mv_prob = 24 * 1; + const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob; + assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256); + switch (mv_mode) { + case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break; + case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break; + case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + default: assert(0); return -1; + } +} + +static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) { + double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) + + log2(1 + abs(new_mv->col - ref_mv->col)); + mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT); + return mv_diff_cost; +} +static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field, + TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + double mv_cost = get_mv_mode_cost(mv_mode); + if (mv_mode == NEW_MV_MODE) { + MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, + bsize, mi_row, mi_col) + .as_mv; + MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field, + tpl_frame, bsize, mi_row, mi_col) + .as_mv; + MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame, + bsize, mi_row, mi_col) + .as_mv; + double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv); + double near_cost = get_mv_diff_cost(&new_mv, &near_mv); + mv_cost += nearest_cost < near_cost ? 
nearest_cost : near_cost; + } + return mv_cost; +} + +static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *mv) { + MACROBLOCKD *xd = &x->e_mbd; + double mv_dist = + get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, mv); + double mv_cost = + get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col); + double mult = 180; + + return mv_cost + mult * log2f(1 + mv_dist); +} + +static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, + MotionField *motion_field, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + double *rd, int_mv *mv) { + int best_mv_mode = ZERO_MV_MODE; + int update = 0; + int mv_mode; + *rd = 0; + for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) { + double this_rd; + int_mv this_mv; + if (mv_mode == NEW_MV_MODE) { + continue; + } + this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv); + if (update == 0) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + update = 1; + } else { + if (this_rd < *rd) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + } + } + } + return best_mv_mode; +} + +static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int tmp_mv_mode_arr[kMvPreCheckSize]; + int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx]; + double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx]; + int_mv *select_mv_arr = cpi->select_mv_arr; + int_mv tmp_select_mv_arr[kMvPreCheckSize]; + int stride = tpl_frame->stride; + double new_mv_rd = 0; + double no_new_mv_rd = 0; + double this_new_mv_rd = 0; + double this_no_new_mv_rd = 0; + int idx; + int tmp_idx; + assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1); + + // no new mv + // diagonal scan order + tmp_idx = 0; + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( + cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, + bsize, nb_row, nb_col, &this_rd, mv); + if (r == 0 && c == 0) { + this_no_new_mv_rd = this_rd; + } + no_new_mv_rd += this_rd; + tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col]; + tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col]; + ++tmp_idx; + } + } + } + + // new mv + mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE; + this_new_mv_rd = eval_mv_mode( + NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]); + new_mv_rd = this_new_mv_rd; + // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE + // beforehand. 
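The diagonal scan used here and in the loop below visits the look-ahead window in this order (r, c in units of the current block size):

    /* idx 0: (0,0)
     * idx 1: (0,1) (1,0)
     * idx 2: (0,2) (1,1) (2,0)
     * idx 3: (0,3) (1,2) (2,1) (3,0)
     * idx 4: (0,4) (1,3) (2,2) (3,1) (4,0)
     * 15 positions in total, matching kMvPreCheckSize = (5 * 6) / 2. */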
+ for (idx = 1; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( + cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, + bsize, nb_row, nb_col, &this_rd, mv); + new_mv_rd += this_rd; + } + } + } + + // update best_mv_mode + tmp_idx = 0; + if (no_new_mv_rd < new_mv_rd) { + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx]; + select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx]; + ++tmp_idx; + } + } + } + rd_diff_arr[mi_row * stride + mi_col] = 0; + } else { + rd_diff_arr[mi_row * stride + mi_col] = + (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd); + } +} + +static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, + MotionField *motion_field, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int unit_rows = tpl_frame->mi_rows / mi_height; + const int unit_cols = tpl_frame->mi_cols / mi_width; + const int max_diagonal_lines = unit_rows + unit_cols - 1; + int idx; + for (idx = 0; idx < max_diagonal_lines; ++idx) { + int r; + for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1); + ++r) { + int c = idx - r; + int mi_row = r * mi_height; + int mi_col = c * mi_width; + assert(c >= 0 && c < unit_cols); + assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows); + assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols); + predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col); + } + } +} + +static void do_motion_search(VP9_COMP *cpi, ThreadData *td, + MotionField *motion_field, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + assert(ref_frame != NULL); + set_mv_limits(cm, x, mi_row, mi_col); + { + int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); + uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset; + uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset; + const int stride = xd->cur_buf->y_stride; + full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf, + ref_frame_buf, stride, bsize, mi_row, mi_col, + &mv.as_mv); + sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride, + bsize, &mv.as_mv); + vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv); + } +} + +static void build_motion_field( + VP9_COMP *cpi, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int 
pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; + int mi_row, mi_col; + int rf_idx; + + tpl_frame->lambda = (pw * ph) >> 2; + assert(pw * ph == tpl_frame->lambda << 2); + + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + if (ref_frame[rf_idx] == NULL) { + continue; + } + vp9_motion_field_reset_mvs(motion_field); + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx], + bsize, mi_row, mi_col); + } + } + } +} +#endif // CONFIG_NON_GREEDY_MV + +static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, + int frame_idx, BLOCK_SIZE bsize) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + VpxTplFrameStats *tpl_frame_stats_before_propagation = + &cpi->tpl_gop_stats.frame_stats_list[frame_idx]; + YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; + YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; + + VP9_COMMON *cm = &cpi->common; + struct scale_factors sf; + int rdmult, idx; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + int mi_row, mi_col; + +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]); + uint8_t *predictor; +#else + DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]); +#endif + DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + + tpl_frame_stats_before_propagation->frame_width = cm->width; + tpl_frame_stats_before_propagation->frame_height = cm->height; + // Setup scaling factor +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height, + cpi->common.use_highbitdepth); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + predictor = CONVERT_TO_BYTEPTR(predictor16); + else + predictor = predictor8; +#else + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Prepare reference frame pointers. If any reference frame slot is + // unavailable, the pointer will be set to Null. + for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) { + int rf_idx = gf_picture[frame_idx].ref_frame[idx]; + if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame; + } + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + xd->cur_buf = this_frame; + + // Get rd multiplier set up. 
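Two per-frame setup details worth noting: the scale factors are deliberately 1:1 (source and destination dimensions are identical), so the inter predictors built during mode estimation never rescale; and the rd multiplier below is derived from tpl_frame->base_qindex, with the quantizer re-initialized to that same qindex immediately after.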
+ rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex); + set_error_per_bit(&cpi->td.mb, rdmult); + vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex); + + tpl_frame->is_valid = 1; + + cm->base_qindex = tpl_frame->base_qindex; + vp9_frame_init_quantizer(cpi); + +#if CONFIG_NON_GREEDY_MV + { + int square_block_idx; + int rf_idx; + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); + build_motion_field(cpi, frame_idx, ref_frame, square_bsize); + } + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize); + } + } + } +#endif // CONFIG_NON_GREEDY_MV + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + int64_t recon_error = 0; + int64_t rate_cost = 0; + int64_t sse = 0; + // Ref frame index in the ref frame buffer. + int ref_frame_idx = -1; + mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, + src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, + tx_size, ref_frame, predictor, &recon_error, &rate_cost, + &sse, &ref_frame_idx); + // Motion flow dependency dispenser. + tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, + tpl_frame->stride); + + tpl_store_before_propagation( + tpl_frame_stats_before_propagation->block_stats_list, + tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride, + recon_error, rate_cost, ref_frame_idx); + + tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, + bsize); + } + } +} + +static void trim_tpl_stats(struct vpx_internal_error_info *error_info, + VpxTplGopStats *tpl_gop_stats, int extra_frames) { + int i; + VpxTplFrameStats *new_frame_stats; + const int new_size = tpl_gop_stats->size - extra_frames; + if (tpl_gop_stats->size <= extra_frames) + vpx_internal_error( + error_info, VPX_CODEC_ERROR, + "The number of frames in VpxTplGopStats is fewer than expected."); + CHECK_MEM_ERROR(error_info, new_frame_stats, + vpx_calloc(new_size, sizeof(*new_frame_stats))); + for (i = 0; i < new_size; i++) { + VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i]; + const int num_blocks = frame_stats->num_blocks; + new_frame_stats[i].num_blocks = frame_stats->num_blocks; + new_frame_stats[i].frame_width = frame_stats->frame_width; + new_frame_stats[i].frame_height = frame_stats->frame_height; + new_frame_stats[i].num_blocks = num_blocks; + CHECK_MEM_ERROR( + error_info, new_frame_stats[i].block_stats_list, + vpx_calloc(num_blocks, sizeof(*new_frame_stats[i].block_stats_list))); + memcpy(new_frame_stats[i].block_stats_list, frame_stats->block_stats_list, + num_blocks * sizeof(*new_frame_stats[i].block_stats_list)); + } + free_tpl_frame_stats_list(tpl_gop_stats); + tpl_gop_stats->size = new_size; + tpl_gop_stats->frame_stats_list = new_frame_stats; +} + +#if CONFIG_NON_GREEDY_MV +#define DUMP_TPL_STATS 0 +#if DUMP_TPL_STATS +static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) { + int i, j; + printf("%d %d\n", h, w); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + printf("%d ", buf[(row + i) * stride + col + j]); + } + } + 
printf("\n"); +} + +static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) { + dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height, + frame_buf->y_width); + dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); + dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); +} + +static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames, + const GF_GROUP *gf_group, + const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) { + int frame_idx; + const VP9_COMMON *cm = &cpi->common; + int rf_idx; + for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) { + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + int mi_row, mi_col; + int ref_frame_idx; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame; + const int gf_frame_offset = gf_group->frame_gop_index[frame_idx]; + const int ref_gf_frame_offset = + gf_group->frame_gop_index[ref_frame_idx]; + printf("=\n"); + printf( + "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d " + "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n", + frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE, + ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset); + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + int_mv mv = vp9_motion_field_info_get_mv(&cpi->motion_field_info, + frame_idx, rf_idx, bsize, + mi_row, mi_col); + printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row, + mv.as_mv.col); + } + } + } + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + const TplDepStats *tpl_ptr = + &tpl_frame + ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + printf("%f ", tpl_ptr->feature_score); + } + } + } + printf("\n"); + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + const int mv_mode = + tpl_frame + ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col]; + printf("%d ", mv_mode); + } + } + printf("\n"); + + dump_frame_buf(gf_picture[frame_idx].frame); + dump_frame_buf(ref_frame_buf); + } + } + } +} +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +void vp9_init_tpl_buffer(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int frame; + + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); +#if CONFIG_NON_GREEDY_MV + int rf_idx; + + vpx_free(cpi->select_mv_arr); + CHECK_MEM_ERROR( + &cm->error, cpi->select_mv_arr, + vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); +#endif + + // TODO(jingning): Reduce the actual memory use for tpl model build up. 
+  for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
+    if (cpi->tpl_stats[frame].width >= mi_cols &&
+        cpi->tpl_stats[frame].height >= mi_rows &&
+        cpi->tpl_stats[frame].tpl_stats_ptr)
+      continue;
+
+#if CONFIG_NON_GREEDY_MV
+    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+      vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
+      CHECK_MEM_ERROR(
+          &cm->error, cpi->tpl_stats[frame].mv_mode_arr[rf_idx],
+          vpx_calloc(mi_rows * mi_cols * 4,
+                     sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx])));
+      vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
+      CHECK_MEM_ERROR(
+          &cm->error, cpi->tpl_stats[frame].rd_diff_arr[rf_idx],
+          vpx_calloc(mi_rows * mi_cols * 4,
+                     sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx])));
+    }
+#endif
+    vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
+    CHECK_MEM_ERROR(&cm->error, cpi->tpl_stats[frame].tpl_stats_ptr,
+                    vpx_calloc(mi_rows * mi_cols,
+                               sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr)));
+    cpi->tpl_stats[frame].is_valid = 0;
+    cpi->tpl_stats[frame].width = mi_cols;
+    cpi->tpl_stats[frame].height = mi_rows;
+    cpi->tpl_stats[frame].stride = mi_cols;
+    cpi->tpl_stats[frame].mi_rows = cm->mi_rows;
+    cpi->tpl_stats[frame].mi_cols = cm->mi_cols;
+  }
+
+  for (frame = 0; frame < REF_FRAMES; ++frame) {
+    cpi->enc_frame_buf[frame].mem_valid = 0;
+    cpi->enc_frame_buf[frame].released = 1;
+  }
+}
+
+void vp9_free_tpl_buffer(VP9_COMP *cpi) {
+  int frame;
+#if CONFIG_NON_GREEDY_MV
+  vp9_free_motion_field_info(&cpi->motion_field_info);
+  vpx_free(cpi->select_mv_arr);
+#endif
+  for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
+#if CONFIG_NON_GREEDY_MV
+    int rf_idx;
+    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+      vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
+      vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
+    }
+#endif
+    vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
+    cpi->tpl_stats[frame].is_valid = 0;
+  }
+  free_tpl_frame_stats_list(&cpi->tpl_gop_stats);
+}
+
+#if CONFIG_RATE_CTRL
+static void accumulate_frame_tpl_stats(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  int show_frame_count = 0;
+  int frame_idx;
+  // Accumulate tpl stats for each frame in the current group of pictures.
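Before the loop itself, a sketch of how a rate controller might read the aggregates it fills in (hypothetical consumer, purely illustrative):

    /* Compare how strongly shown frame i is referenced later in the
     * group; larger ratios argue for spending more bits on it. */
    const double dep_ratio =
        (double)cpi->tpl_stats_info[i].mc_dep_cost /
        (double)VPXMAX(cpi->tpl_stats_info[i].intra_cost, 1);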
+ for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t inter_cost_base = 0; + int64_t mc_dep_cost_base = 0; + int64_t mc_ref_cost_base = 0; + int64_t mc_flow_base = 0; + int row, col; + + if (!tpl_frame->is_valid) continue; + + for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { + for (col = 0; col < cm->mi_cols; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + intra_cost_base += this_stats->intra_cost; + inter_cost_base += this_stats->inter_cost; + mc_dep_cost_base += this_stats->mc_dep_cost; + mc_ref_cost_base += this_stats->mc_ref_cost; + mc_flow_base += this_stats->mc_flow; + } + } + + cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base; + cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base; + + ++show_frame_count; + } +} +#endif // CONFIG_RATE_CTRL + +void vp9_setup_tpl_stats(VP9_COMP *cpi) { + GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int tpl_group_frames = 0; + int frame_idx; + int extended_frame_count; + cpi->tpl_bsize = BLOCK_32X32; + + extended_frame_count = + init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); + + init_tpl_stats(cpi); + + init_tpl_stats_before_propagation(&cpi->common.error, &cpi->tpl_gop_stats, + cpi->tpl_stats, tpl_group_frames, + cpi->common.width, cpi->common.height); + + // Backward propagation from tpl_group_frames to 1. + for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { + if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; + mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); + } + + // TPL stats has extra frames from next GOP. Trim those extra frames for + // Qmode. + trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count); + + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL) { + const vpx_codec_err_t codec_status = + vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cpi->common.error, codec_status, + "vp9_extrc_send_tpl_stats() failed"); + } + } + +#if CONFIG_NON_GREEDY_MV + cpi->tpl_ready = 1; +#if DUMP_TPL_STATS + dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize); +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + accumulate_frame_tpl_stats(cpi); + } +#endif // CONFIG_RATE_CTRL +} diff --git a/vp9/encoder/vp9_tpl_model.h b/vp9/encoder/vp9_tpl_model.h new file mode 100644 index 000000000..04beb2261 --- /dev/null +++ b/vp9/encoder/vp9_tpl_model.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ +#define VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef M_LOG2_E +#define M_LOG2_E 0.693147180559945309417 +#endif +#define log2f(x) (log(x) / (float)M_LOG2_E) + +#define TPL_DEP_COST_SCALE_LOG2 4 + +typedef struct GF_PICTURE { + YV12_BUFFER_CONFIG *frame; + int ref_frame[3]; + FRAME_UPDATE_TYPE update_type; +} GF_PICTURE; + +void vp9_init_tpl_buffer(VP9_COMP *cpi); +void vp9_setup_tpl_stats(VP9_COMP *cpi); +void vp9_free_tpl_buffer(VP9_COMP *cpi); + +void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ diff --git a/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/vp9/encoder/x86/highbd_temporal_filter_sse4.c index a7f5117cf..97f182c66 100644 --- a/vp9/encoder/x86/highbd_temporal_filter_sse4.c +++ b/vp9/encoder/x86/highbd_temporal_filter_sse4.c @@ -16,7 +16,7 @@ #include "vpx/vpx_integer.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_temporal_filter.h" -#include "vp9/encoder/x86/temporal_filter_constants.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" // Compute (a-b)**2 for 8 pixels with size 16-bit static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b, @@ -141,11 +141,12 @@ static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32, count_u16 = _mm_adds_epu16(count_u16, sum_u16); _mm_storeu_si128((__m128i *)count, count_u16); - pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); - pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + pred_0_u32 = _mm_mullo_epi32(sum_first_u32, pred_0_u32); + pred_1_u32 = _mm_mullo_epi32(sum_second_u32, pred_1_u32); + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c index 87e68fb43..7571bfcca 100644 --- a/vp9/encoder/x86/temporal_filter_sse4.c +++ b/vp9/encoder/x86/temporal_filter_sse4.c @@ -16,7 +16,7 @@ #include "vpx/vpx_integer.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_temporal_filter.h" -#include "vp9/encoder/x86/temporal_filter_constants.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" // Read in 8 pixels from a and b as 8-bit unsigned integers, compute the // difference squared, and store as unsigned 16-bit integer to dst. diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c deleted file mode 100644 index 0e04a2f41..000000000 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#if defined(_MSC_VER) -#include <intrin.h> -#endif -#include <emmintrin.h> -#include <smmintrin.h> - -#include "vpx_dsp/vpx_dsp_common.h" -#include "vp9/encoder/vp9_encoder.h" -#include "vpx_ports/mem.h" - -#ifdef __GNUC__ -#define LIKELY(v) __builtin_expect(v, 1) -#define UNLIKELY(v) __builtin_expect(v, 0) -#else -#define LIKELY(v) (v) -#define UNLIKELY(v) (v) -#endif - -static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { - int_mv result; - result.as_mv.row = row; - result.as_mv.col = col; - return result; -} - -static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { - // This is simplified from the C implementation to utilise that - // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and - // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] - return mv.as_int == 0 ? 0 : 1; -} - -static INLINE int mv_cost(const int_mv mv, const int *joint_cost, - int *const comp_cost[2]) { - return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + - comp_cost[1][mv.as_mv.col]; -} - -static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, - int sad_per_bit) { - const int_mv diff = - pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - -/***************************************************************************** - * This function utilizes 3 properties of the cost function lookup tables, * - * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * - * vp9_encoder.c. * - * For the joint cost: * - * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * - * For the component costs: * - * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * - * (Equal costs for both components) * - * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * - * (Cost function is even) * - * If these do not hold, then this function cannot be used without * - * modification, in which case you can revert to using the C implementation, * - * which does not rely on these properties. * - *****************************************************************************/ -int vp9_diamond_search_sad_avx(const MACROBLOCK *x, - const search_site_config *cfg, MV *ref_mv, - MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv) { - const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); - const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int); - const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); - const __m128i v_min_mv_w = _mm_set1_epi32((int)minmv.as_int); - - const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); - - const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]); - const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]); - - // search_param determines the length of the initial step and hence the number - // of iterations. - // 0 = initial step (MAX_FIRST_STEP) pel - // 1 = (MAX_FIRST_STEP/2) pel, - // 2 = (MAX_FIRST_STEP/4) pel... 
- const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; - const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; - const int tot_steps = cfg->total_steps - search_param; - - const int_mv fcenter_mv = - pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); - const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int); - - const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); - - int_mv bmv = pack_int_mv(ref_row, ref_col); - int_mv new_bmv = bmv; - __m128i v_bmv_w = _mm_set1_epi32((int)bmv.as_int); - - const int what_stride = x->plane[0].src.stride; - const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; - const uint8_t *const what = x->plane[0].src.buf; - const uint8_t *const in_what = - x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; - - // Work out the start point for the search - const uint8_t *best_address = in_what; - const uint8_t *new_best_address = best_address; -#if VPX_ARCH_X86_64 - __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address); -#else - __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); -#endif - - unsigned int best_sad; - int i, j, step; - - // Check the prerequisite cost function properties that are easy to check - // in an assert. See the function-level documentation for details on all - // prerequisites. - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - - *num00 = 0; - - for (i = 0, step = 0; step < tot_steps; step++) { - for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { - __m128i v_sad_d, v_cost_d, v_outside_d, v_inside_d, v_diff_mv_w; -#if VPX_ARCH_X86_64 - __m128i v_blocka[2]; -#else - __m128i v_blocka[1]; -#endif - - // Compute the candidate motion vectors - const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i *)&ss_mv[i]); - const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w); - // Clamp them to the search bounds - __m128i v_these_mv_clamp_w = v_these_mv_w; - v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w); - v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w); - // The ones that did not change are inside the search area - v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w); - - // If none of them are inside, then move on - if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) { - continue; - } - - // The inverse mask indicates which of the MVs are outside - v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8((int8_t)0xff)); - // Shift right to keep the sign bit clear, we will use this later - // to set the cost to the maximum value. - v_outside_d = _mm_srli_epi32(v_outside_d, 1); - - // Compute the difference MV - v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv); - // We utilise the fact that the cost function is even, and use the - // absolute difference. This allows us to use unsigned indexes later - // and reduces cache pressure somewhat as only a half of the table - // is ever referenced. - v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w); - - // Compute the SIMD pointer offsets. 
- { -#if VPX_ARCH_X86_64 // sizeof(intptr_t) == 8 - // Load the offsets - __m128i v_bo10_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 0]); - __m128i v_bo32_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 2]); - // Set the ones falling outside to zero - v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d)); - v_bo32_q = - _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d)); - // Compute the candidate addresses - v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q); - v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q); -#else // VPX_ARCH_X86 // sizeof(intptr_t) == 4 - __m128i v_bo_d = _mm_loadu_si128((const __m128i *)&ss_os[i]); - v_bo_d = _mm_and_si128(v_bo_d, v_inside_d); - v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d); -#endif - } - - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); - - // Look up the component cost of the residual motion vector - { - const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0); - const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1); - const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2); - const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3); - const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4); - const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5); - const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6); - const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7); - - // Note: This is a use case for vpgather in AVX2 - const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0]; - const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1]; - const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2]; - const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3]; - - __m128i v_cost_10_d, v_cost_32_d; - v_cost_10_d = _mm_cvtsi32_si128(cost0); - v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1); - v_cost_32_d = _mm_cvtsi32_si128(cost2); - v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1); - v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d); - } - - // Now add in the joint cost - { - const __m128i v_sel_d = - _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128()); - const __m128i v_joint_cost_d = - _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d); - v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d); - } - - // Multiply by sad_per_bit - v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d); - // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT) - v_cost_d = _mm_add_epi32(v_cost_d, - _mm_set1_epi32(1 << (VP9_PROB_COST_SHIFT - 1))); - v_cost_d = _mm_srai_epi32(v_cost_d, VP9_PROB_COST_SHIFT); - // Add the cost to the sad - v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d); - - // Make the motion vectors outside the search area have max cost - // by or'ing in the comparison mask, this way the minimum search won't - // pick them. - v_sad_d = _mm_or_si128(v_sad_d, v_outside_d); - - // Find the minimum value and index horizontally in v_sad_d - { - // Try speculatively on 16 bits, so we can use the minpos intrinsic - const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d); - const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w); - - uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0); - uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1); - - // If the local best value is not saturated, just use it, otherwise - // find the horizontal minimum again the hard way on 32 bits. - // This is executed rarely. 
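For the record, the trick in the code being deleted here: _mm_minpos_epu16 only exists for unsigned 16-bit lanes, so the 32-bit SADs are first packed with saturation. A packed minimum of 0xffff is then ambiguous, since the true 32-bit minimum may simply exceed 16 bits, and only in that rare case is the horizontal minimum redone at full width below.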
- if (UNLIKELY(local_best_sad == 0xffff)) { - __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; - - v_loval_d = v_sad_d; - v_loidx_d = _mm_set_epi32(3, 2, 1, 0); - v_hival_d = _mm_srli_si128(v_loval_d, 8); - v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); - - v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); - - v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); - v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); - v_hival_d = _mm_srli_si128(v_loval_d, 4); - v_hiidx_d = _mm_srli_si128(v_loidx_d, 4); - - v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); - - v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); - v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); - - local_best_sad = _mm_extract_epi32(v_loval_d, 0); - local_best_idx = _mm_extract_epi32(v_loidx_d, 0); - } - - // Update the global minimum if the local minimum is smaller - if (LIKELY(local_best_sad < best_sad)) { -#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif - new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; -#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) -#pragma GCC diagnostic pop -#endif - new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; - - best_sad = local_best_sad; - } - } - } - - bmv = new_bmv; - best_address = new_best_address; - - v_bmv_w = _mm_set1_epi32((int)bmv.as_int); -#if VPX_ARCH_X86_64 - v_ba_q = _mm_set1_epi64x((intptr_t)best_address); -#else - v_ba_d = _mm_set1_epi32((intptr_t)best_address); -#endif - - if (UNLIKELY(best_address == in_what)) { - (*num00)++; - } - } - - *best_mv = bmv.as_mv; - return best_sad; -} diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c index bf0e8b121..94506aad0 100644 --- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -469,18 +469,18 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, // It's used to choose the src offset and filter coefficient offset. const int offset_idx1 = (offset1_q4 >> 4) & 1; const int offset_idx2 = (offset2_q4 >> 4) & 1; - static const shuffle_filter_funcs shuffle_filter_funcs[2] = { + static const shuffle_filter_funcs kShuffleFilterFuncs[2] = { shuffle_filter_ssse3, shuffle_filter_odd_ssse3 }; - static const convolve8_funcs convolve8_funcs[2] = { + static const convolve8_funcs kConvolve8Funcs[2] = { convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3 }; assert(w && h); shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0); - shuffle_filter_funcs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); - shuffle_filter_funcs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); + kShuffleFilterFuncs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); + kShuffleFilterFuncs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); // Sub 64 to avoid overflow. // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here. 
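The kShuffleFilterFuncs/kConvolve8Funcs tables renamed above drive the even/odd tap-offset dispatch of the 4-to-3 scaler: the source phase advances by step_q4 (1/16-pel units) per output pixel, and bit 4 of each accumulated offset, the parity of its integer-pel part, picks the even- or odd-offset kernel. A small standalone C sketch of that selection, with the offset derivation simplified (the real computation lives in scale_plane_4_to_3_general()):

#include <stdio.h>

int main(void) {
  /* 4-to-3 downscale: the source advances by about 4/3 pixels per output
   * pixel, tracked in 1/16-pel (Q4) precision as in the SSSE3 code. */
  const int step_q4 = (4 << 4) / 3; /* 21 in Q4, i.e. 1.3125 pixels */
  int phase_scaler;
  for (phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
    const int offset1_q4 = phase_scaler + 1 * step_q4;
    const int offset2_q4 = phase_scaler + 2 * step_q4;
    /* Bit 4 is the integer-pel parity; it indexes kShuffleFilterFuncs and
     * kConvolve8Funcs to pick the even- or odd-offset variant. */
    printf("phase %2d: offset_idx1=%d offset_idx2=%d\n", phase_scaler,
           (offset1_q4 >> 4) & 1, (offset2_q4 >> 4) & 1);
  }
  return 0;
}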
@@ -522,11 +522,11 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); - d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); - d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2); d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); - d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); - d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 @@ -598,11 +598,11 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, loadu_8bit_16x4(t, stride_hor, &s[4]); d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); - d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); - d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2); d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); - d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); - d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index da285be8e..bf44b0867 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -16,6 +16,8 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/x86/bitdepth_conversion_avx2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" // Zero fill 8 positions in the output buffer. 
static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) { @@ -29,11 +31,13 @@ static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) { } static VPX_FORCE_INLINE void load_fp_values_avx2( - const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, - __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { - *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant) { + *round = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->round_fp)); *round = _mm256_permute4x64_epi64(*round, 0x54); - *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->quant_fp)); *quant = _mm256_permute4x64_epi64(*quant, 0x54); *dequant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); @@ -98,13 +102,13 @@ static VPX_FORCE_INLINE void quantize_fp_16( } void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { __m256i round, quant, dequant, thr; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -113,8 +117,7 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; // Setup global values - load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant); thr = _mm256_setzero_si256(); quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, @@ -203,14 +206,13 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_16( } void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { __m256i round, quant, dequant, thr; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -219,8 +221,7 @@ void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; // Setup global values - load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant); thr = _mm256_srli_epi16(dequant, 2); quant = _mm256_slli_epi16(quant, 1); { @@ -286,16 +287,17 @@ static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) { } static VPX_FORCE_INLINE void highbd_load_fp_values( - const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, - __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { - *round = highbd_init_256(round_ptr); - *quant = highbd_init_256(quant_ptr); + const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant) { + *round = highbd_init_256(mb_plane->round_fp); + *quant = 
highbd_init_256(mb_plane->quant_fp); *dequant = highbd_init_256(dequant_ptr); } static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob( const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) { - const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); + const __m256i packed_nz_mask = + _mm256_packs_epi32(nz_mask, _mm256_setzero_si256()); const __m256i packed_nz_mask_perm = _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); const __m256i iscan = @@ -324,16 +326,15 @@ static VPX_FORCE_INLINE void highbd_quantize_fp( } void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int step = 8; __m256i round, quant, dequant; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -342,8 +343,7 @@ void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; // Setup global values - highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, iscan + n_coeffs, qcoeff_ptr + n_coeffs, @@ -390,14 +390,14 @@ static VPX_FORCE_INLINE void highbd_quantize_fp_32x32( } void vp9_highbd_quantize_fp_32x32_avx2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const int step = 8; __m256i round, quant, dequant, thr; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -406,8 +406,7 @@ void vp9_highbd_quantize_fp_32x32_avx2( n_coeffs = -n_coeffs; // Setup global values - highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); thr = _mm256_srli_epi32(dequant, 2); // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when // calculating the zbin mask. 
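This file, and the SSE2/SSSE3 files below, switch the quantizer entry points from loose round_ptr/quant_ptr/scan/iscan arguments to the encoder's macroblock_plane and ScanOrder structs. For orientation, here is a scalar C sketch of the "fp" quantization these kernels vectorize, written against the new argument shape; the stand-in struct names and the omitted int16 clamping mark it as an illustration, not the reference vp9_quantize_fp_c:

#include <stdint.h>
#include <stdlib.h>

typedef int32_t tran_low_t;

struct plane_sketch {      /* stand-in for the real macroblock_plane */
  const int16_t *round_fp; /* rounding offsets: slot 0 = DC, slot 1 = AC */
  const int16_t *quant_fp; /* Q1.15 quantizer steps, same layout */
};

struct scan_sketch {    /* stand-in for the real ScanOrder */
  const int16_t *iscan; /* raster index -> position in the scan order */
};

static void quantize_fp_sketch(const tran_low_t *coeff, intptr_t n_coeffs,
                               const struct plane_sketch *p,
                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
                               const int16_t *dequant, uint16_t *eob_ptr,
                               const struct scan_sketch *so) {
  intptr_t i;
  int eob = -1;
  for (i = 0; i < n_coeffs; i++) {
    const int slot = i != 0; /* DC uses slot 0, all AC coefficients slot 1 */
    const int sign = coeff[i] < 0 ? -1 : 1;
    /* Round, multiply by the Q1.15 step, keep the integer part. */
    const int abs_q =
        ((abs((int)coeff[i]) + p->round_fp[slot]) * p->quant_fp[slot]) >> 16;
    qcoeff[i] = sign * abs_q;
    dqcoeff[i] = qcoeff[i] * dequant[slot];
    /* eob is the highest scan position holding a nonzero quantized value. */
    if (abs_q && so->iscan[i] > eob) eob = so->iscan[i];
  }
  *eob_ptr = (uint16_t)(eob + 1);
}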
diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index c87723443..2481eb366 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -17,12 +17,14 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); __m128i thr; int nzflag; @@ -31,11 +33,10 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; __m128i qcoeff0, qcoeff1; __m128i eob; - - (void)scan; + const int16_t *iscan = scan_order->iscan; // Setup global values. - load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.c b/vp9/encoder/x86/vp9_quantize_ssse3.c index d35004e37..98decae74 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3.c +++ b/vp9/encoder/x86/vp9_quantize_ssse3.c @@ -17,12 +17,14 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); __m128i thr; int nzflag; @@ -31,11 +33,10 @@ void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i coeff0, coeff1; __m128i qcoeff0, qcoeff1; __m128i eob; - - (void)scan; + const int16_t *iscan = scan_order->iscan; // Setup global values. - load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); @@ -119,12 +120,11 @@ void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m128i one_s16 = _mm_set1_epi16(1); __m128i thr; @@ -134,11 +134,10 @@ void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i coeff0, coeff1; __m128i qcoeff0, qcoeff1; __m128i eob; - - (void)scan; + const int16_t *iscan = scan_order->iscan; // Setup global values. 
- load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); // The 32x32 halves round. round = _mm_add_epi16(round, one_s16); round = _mm_srli_epi16(round, 1); diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 02e50a857..fd81bce7b 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -25,22 +25,16 @@ std::unique_ptr<VP9RateControlRTC> VP9RateControlRTC::Create( VP9RateControlRTC()); if (!rc_api) return nullptr; rc_api->cpi_ = static_cast<VP9_COMP *>(vpx_memalign(32, sizeof(*cpi_))); - if (!rc_api->cpi_) { - rc_api.reset(); - return nullptr; - } + if (!rc_api->cpi_) return nullptr; vp9_zero(*rc_api->cpi_); - rc_api->InitRateControl(cfg); + if (!rc_api->InitRateControl(cfg)) return nullptr; if (cfg.aq_mode) { VP9_COMP *const cpi = rc_api->cpi_; cpi->segmentation_map = static_cast<uint8_t *>( vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, sizeof(*cpi->segmentation_map))); - if (!cpi->segmentation_map) { - rc_api.reset(); - return nullptr; - } + if (!cpi->segmentation_map) return nullptr; cpi->cyclic_refresh = vp9_cyclic_refresh_alloc(cpi->common.mi_rows, cpi->common.mi_cols); cpi->cyclic_refresh->content_mode = 0; @@ -48,7 +42,30 @@ std::unique_ptr<VP9RateControlRTC> VP9RateControlRTC::Create( return rc_api; } -void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { +VP9RateControlRTC::~VP9RateControlRTC() { + if (cpi_) { + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { + int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); + LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; + vpx_free(lc->map); + vpx_free(lc->last_coded_q_map); + vpx_free(lc->consec_zero_mv); + } + } + } + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vpx_free(cpi_->segmentation_map); + cpi_->segmentation_map = NULL; + vp9_cyclic_refresh_free(cpi_->cyclic_refresh); + } + vpx_free(cpi_); + } +} + +bool VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { VP9_COMMON *cm = &cpi_->common; VP9EncoderConfig *oxcf = &cpi_->oxcf; RATE_CONTROL *const rc = &cpi_->rc; @@ -65,7 +82,7 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { cm->current_video_frame = 0; rc->kf_boost = DEFAULT_KF_BOOST; - UpdateRateControl(rc_cfg); + if (!UpdateRateControl(rc_cfg)) return false; vp9_set_mb_mi(cm, cm->width, cm->height); cpi_->use_svc = (cpi_->svc.number_spatial_layers > 1 || @@ -79,10 +96,21 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { vp9_rc_init(oxcf, 0, rc); rc->constrain_gf_key_freq_onepass_vbr = 0; cpi_->sf.use_nonrd_pick_mode = 1; + return true; } -void VP9RateControlRTC::UpdateRateControl( +bool VP9RateControlRTC::UpdateRateControl( const VP9RateControlRtcConfig &rc_cfg) { + // Since VPX_MAX_LAYERS (12) is less than the product of VPX_SS_MAX_LAYERS (5) + // and VPX_TS_MAX_LAYERS (5), check all three. 
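+ // For example, ss_number_layers = 3 with ts_number_layers = 5 satisfies
+ // both per-dimension bounds, but 3 * 5 = 15 exceeds VPX_MAX_LAYERS, so
+ // only the product check rejects that combination.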
+ if (rc_cfg.ss_number_layers < 1 || + rc_cfg.ss_number_layers > VPX_SS_MAX_LAYERS || + rc_cfg.ts_number_layers < 1 || + rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS || + rc_cfg.ss_number_layers * rc_cfg.ts_number_layers > VPX_MAX_LAYERS) { + return false; + } + VP9_COMMON *cm = &cpi_->common; VP9EncoderConfig *oxcf = &cpi_->oxcf; RATE_CONTROL *const rc = &cpi_->rc; @@ -102,6 +130,8 @@ void VP9RateControlRTC::UpdateRateControl( oxcf->maximum_buffer_size_ms = rc_cfg.buf_sz; oxcf->under_shoot_pct = rc_cfg.undershoot_pct; oxcf->over_shoot_pct = rc_cfg.overshoot_pct; + oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh; + oxcf->content = rc_cfg.is_screen ? VP9E_CONTENT_SCREEN : VP9E_CONTENT_DEFAULT; oxcf->ss_number_layers = rc_cfg.ss_number_layers; oxcf->ts_number_layers = rc_cfg.ts_number_layers; oxcf->temporal_layering_mode = (VP9E_TEMPORAL_LAYERING_MODE)( @@ -112,7 +142,19 @@ void VP9RateControlRTC::UpdateRateControl( cpi_->framerate = rc_cfg.framerate; cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers; cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers; + vp9_set_mb_mi(cm, cm->width, cm->height); + + if (setjmp(cpi_->common.error.jmp)) { + cpi_->common.error.setjmp = 0; + vpx_clear_system_state(); + return false; + } + cpi_->common.error.setjmp = 1; + + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { + oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl]; + } for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) { for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { const int layer = @@ -126,21 +168,33 @@ void VP9RateControlRTC::UpdateRateControl( lrc->best_quality = vp9_quantizer_to_qindex(rc_cfg.min_quantizers[layer]); lc->scaling_factor_num = rc_cfg.scaling_factor_num[sl]; lc->scaling_factor_den = rc_cfg.scaling_factor_den[sl]; - oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl]; } } vp9_set_rc_buffer_sizes(cpi_); vp9_new_framerate(cpi_, cpi_->framerate); if (cpi_->svc.number_temporal_layers > 1 || cpi_->svc.number_spatial_layers > 1) { - if (cm->current_video_frame == 0) vp9_init_layer_context(cpi_); + if (cm->current_video_frame == 0) { + vp9_init_layer_context(cpi_); + // svc->framedrop_mode is not currently exposed, so only allow for + // full superframe drop for now. + cpi_->svc.framedrop_mode = FULL_SUPERFRAME_DROP; + } vp9_update_layer_context_change_config(cpi_, (int)cpi_->oxcf.target_bandwidth); + cpi_->svc.max_consec_drop = rc_cfg.max_consec_drop; } vp9_check_reset_rc_flag(cpi_); + + cpi_->common.error.setjmp = 0; + return true; } -void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { +// Compute the QP for the frame. If the frame is dropped this function +// returns kDrop, and no QP is computed. If the frame is encoded (not dropped) +// the QP is computed and kOk is returned. +FrameDropDecision VP9RateControlRTC::ComputeQP( + const VP9FrameParamsQpRTC &frame_params) { VP9_COMMON *const cm = &cpi_->common; int width, height; cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id; @@ -157,7 +211,7 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { cm->height = height; } vp9_set_mb_mi(cm, cm->width, cm->height); - cm->frame_type = frame_params.frame_type; + cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type); // This is needed to ensure key frame does not get unset in rc_get_svc_params. cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0; cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 
1 : 0; @@ -192,11 +246,51 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { vp9_restore_layer_context(cpi_); vp9_rc_get_svc_params(cpi_); } + if (cpi_->svc.spatial_layer_id == 0) vp9_zero(cpi_->svc.drop_spatial_layer); + // SVC: check for skip encoding of enhancement layer if the + // layer target bandwidth = 0. + if (vp9_svc_check_skip_enhancement_layer(cpi_)) + return FrameDropDecision::kDrop; + // Check for dropping this frame based on buffer level. + // Never drop on key frame, or if base layer is key for svc, + if (!frame_is_intra_only(cm) && + (!cpi_->use_svc || + !cpi_->svc.layer_context[cpi_->svc.temporal_layer_id].is_key_frame)) { + if (vp9_rc_drop_frame(cpi_)) { + // For FULL_SUPERFRAME_DROP mode (the only mode considered here): + // if the superframe drop is decided we need to save the layer context for + // all spatial layers, and call update_buffer_level and postencode_drop + // for all spatial layers. + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + vp9_save_layer_context(cpi_); + for (int sl = 1; sl < cpi_->svc.number_spatial_layers; sl++) { + cpi_->svc.spatial_layer_id = sl; + vp9_restore_layer_context(cpi_); + vp9_update_buffer_level_svc_preencode(cpi_); + vp9_rc_postencode_update_drop_frame(cpi_); + vp9_save_layer_context(cpi_); + } + } + return FrameDropDecision::kDrop; + } + } + // Compute the QP for the frame. int bottom_index, top_index; cpi_->common.base_qindex = vp9_rc_pick_q_and_bounds(cpi_, &bottom_index, &top_index); if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_setup(cpi_); + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) + vp9_save_layer_context(cpi_); + + cpi_->last_frame_dropped = 0; + cpi_->svc.last_layer_dropped[cpi_->svc.spatial_layer_id] = 0; + if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1) + cpi_->svc.num_encoded_top_layer++; + + return FrameDropDecision::kOk; } int VP9RateControlRTC::GetQP() const { return cpi_->common.base_qindex; } @@ -219,7 +313,31 @@ bool VP9RateControlRTC::GetSegmentationData( return true; } -void VP9RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { +void VP9RateControlRTC::PostEncodeUpdate( + uint64_t encoded_frame_size, const VP9FrameParamsQpRTC &frame_params) { + cpi_->common.frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type); + cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id; + cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id; + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + vp9_restore_layer_context(cpi_); + const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id, + cpi_->svc.temporal_layer_id, + cpi_->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer]; + cpi_->common.base_qindex = lc->frame_qp; + cpi_->common.MBs = lc->MBs; + // For spatial-svc, allow cyclic-refresh to be applied on the spatial + // layers, for the base temporal layer. 
+ if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi_->svc.number_spatial_layers > 1 && + cpi_->svc.temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi_->cyclic_refresh; + cr->qindex_delta[0] = lc->qindex_delta[0]; + cr->qindex_delta[1] = lc->qindex_delta[1]; + cr->qindex_delta[2] = lc->qindex_delta[2]; + } + } vp9_rc_postencode_update(cpi_, encoded_frame_size); if (cpi_->svc.number_spatial_layers > 1 || cpi_->svc.number_temporal_layers > 1) diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index b209e4db6..85005c547 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -14,22 +14,20 @@ #include <cstdint> #include <memory> -#include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_enums.h" -#include "vp9/common/vp9_onyxc_int.h" #include "vp9/vp9_iface_common.h" #include "vp9/encoder/vp9_aq_cyclicrefresh.h" -#include "vp9/encoder/vp9_encoder.h" -#include "vp9/encoder/vp9_firstpass.h" #include "vp9/vp9_cx_iface.h" #include "vpx/internal/vpx_ratectrl_rtc.h" #include "vpx_mem/vpx_mem.h" -namespace libvpx { +struct VP9_COMP; +namespace libvpx { struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { public: VP9RateControlRtcConfig() { + ss_number_layers = 1; vp9_zero(max_quantizers); vp9_zero(min_quantizers); vp9_zero(scaling_factor_den); @@ -40,20 +38,21 @@ struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { scaling_factor_den[0] = 1; max_quantizers[0] = max_quantizer; min_quantizers[0] = min_quantizer; + max_consec_drop = INT_MAX; } // Number of spatial layers int ss_number_layers; - // Number of temporal layers - int ts_number_layers; int max_quantizers[VPX_MAX_LAYERS]; int min_quantizers[VPX_MAX_LAYERS]; int scaling_factor_num[VPX_SS_MAX_LAYERS]; int scaling_factor_den[VPX_SS_MAX_LAYERS]; + // This is only for SVC for now. + int max_consec_drop; }; struct VP9FrameParamsQpRTC { - FRAME_TYPE frame_type; + RcFrameType frame_type; int spatial_layer_id; int temporal_layer_id; }; @@ -69,63 +68,46 @@ struct VP9SegmentationData { // the encoder. To use this interface, you need to link with libvpxrc.a. 
// // #include "vp9/ratectrl_rtc.h" -// VP9RateControlRTC rc_api; // VP9RateControlRtcConfig cfg; // VP9FrameParamsQpRTC frame_params; // // YourFunctionToInitializeConfig(cfg); -// rc_api.InitRateControl(cfg); +// std::unique_ptr<VP9RateControlRTC> rc_api = VP9RateControlRTC::Create(cfg); // // start encoding // while (frame_to_encode) { // if (config_changed) -// rc_api.UpdateRateControl(cfg); +// rc_api->UpdateRateControl(cfg); // YourFunctionToFillFrameParams(frame_params); -// rc_api.ComputeQP(frame_params); -// YourFunctionToUseQP(rc_api.GetQP()); -// YourFunctionToUseLoopfilter(rc_api.GetLoopfilterLevel()); +// rc_api->ComputeQP(frame_params); +// YourFunctionToUseQP(rc_api->GetQP()); +// YourFunctionToUseLoopfilter(rc_api->GetLoopfilterLevel()); // // After encoding -// rc_api.PostEncode(encoded_frame_size); +// rc_api->PostEncode(encoded_frame_size, frame_params); // } class VP9RateControlRTC { public: static std::unique_ptr<VP9RateControlRTC> Create( const VP9RateControlRtcConfig &cfg); - ~VP9RateControlRTC() { - if (cpi_) { - if (cpi_->svc.number_spatial_layers > 1 || - cpi_->svc.number_temporal_layers > 1) { - for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { - for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { - int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); - LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; - vpx_free(lc->map); - vpx_free(lc->last_coded_q_map); - vpx_free(lc->consec_zero_mv); - } - } - } - if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { - vpx_free(cpi_->segmentation_map); - cpi_->segmentation_map = NULL; - vp9_cyclic_refresh_free(cpi_->cyclic_refresh); - } - vpx_free(cpi_); - } - } + ~VP9RateControlRTC(); - void UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg); + bool UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; int GetLoopfilterLevel() const; bool GetSegmentationData(VP9SegmentationData *segmentation_data) const; - void ComputeQP(const VP9FrameParamsQpRTC &frame_params); + // ComputeQP computes the QP if the frame is not dropped (kOk return), + // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate + // are not to be called (vp9_rc_postencode_update_drop_frame is already + // called via ComputeQP if drop is decided). + FrameDropDecision ComputeQP(const VP9FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame - void PostEncodeUpdate(uint64_t encoded_frame_size); + void PostEncodeUpdate(uint64_t encoded_frame_size, + const VP9FrameParamsQpRTC &frame_params); private: VP9RateControlRTC() {} - void InitRateControl(const VP9RateControlRtcConfig &cfg); - VP9_COMP *cpi_; + bool InitRateControl(const VP9RateControlRtcConfig &cfg); + struct VP9_COMP *cpi_; }; } // namespace libvpx diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index f42912d35..2e6f9a451 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -143,7 +143,6 @@ get_frame_type_from_update_type(FRAME_UPDATE_TYPE update_type) { default: fprintf(stderr, "Unsupported update_type %d\n", update_type); abort(); - return kFrameTypeInter; } } @@ -183,10 +182,11 @@ static void update_motion_vector_info( const MV_REFERENCE_FRAME *in_ref_frame = input_motion_vector_info[i].ref_frame; output_motion_vector_info[i].mv_count = - (in_ref_frame[0] == INTRA_FRAME) ? 0 - : ((in_ref_frame[1] == NONE) ? 
1 : 2); - if (in_ref_frame[0] == NONE) { - fprintf(stderr, "in_ref_frame[0] shouldn't be NONE\n"); + (in_ref_frame[0] == INTRA_FRAME) + ? 0 + : ((in_ref_frame[1] == NO_REF_FRAME) ? 1 : 2); + if (in_ref_frame[0] == NO_REF_FRAME) { + fprintf(stderr, "in_ref_frame[0] shouldn't be NO_REF_FRAME\n"); abort(); } output_motion_vector_info[i].ref_frame[0] = diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index 7920e95ee..d610a5e15 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -309,7 +309,7 @@ struct EncodeFrameResult { // The tpl stats stored in the vector is according to the encoding order. // For example, suppose there are N show frames for the current GOP. // Then tpl_stats_info[0] stores the information of the first frame to be - // encoded for this GOP, i.e, the AltRef frame. + // encoded for this GOP, i.e., the AltRef frame. std::vector<TplStatsInfo> tpl_stats_info; ImageBuffer coded_frame; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index dee175dc0..e738feda0 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -29,6 +29,8 @@ #include "vp9/vp9_cx_iface.h" #include "vp9/vp9_iface_common.h" +#include "vpx/vpx_tpl.h" + typedef struct vp9_extracfg { int cpu_used; // available cpu percentage in 1/16 unsigned int enable_auto_alt_ref; @@ -129,6 +131,8 @@ struct vpx_codec_alg_priv { BufferPool *buffer_pool; }; +// Called by encoder_set_config() and encoder_encode() only. Must not be called +// by encoder_init(). static vpx_codec_err_t update_error_state( vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) { const vpx_codec_err_t res = error->error_code; @@ -635,8 +639,12 @@ static vpx_codec_err_t set_encoder_config( for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { - oxcf->layer_target_bitrate[sl * oxcf->ts_number_layers + tl] = - 1000 * cfg->layer_target_bitrate[sl * oxcf->ts_number_layers + tl]; + const int layer = sl * oxcf->ts_number_layers + tl; + if (cfg->layer_target_bitrate[layer] > INT_MAX / 1000) + oxcf->layer_target_bitrate[layer] = INT_MAX; + else + oxcf->layer_target_bitrate[layer] = + 1000 * cfg->layer_target_bitrate[layer]; } } if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) { @@ -789,10 +797,22 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS) ERROR("Cannot change width or height after initialization"); - if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) || + // Note: function encoder_set_config() is allowed to be called multiple + // times. However, when the original frame width or height is less than two + // times of the new frame width or height, a forced key frame should be + // used. To make sure the correct detection of a forced key frame, we need + // to update the frame width and height only when the actual encoding is + // performed. cpi->last_coded_width and cpi->last_coded_height are used to + // track the actual coded frame size. + if ((ctx->cpi->last_coded_width && ctx->cpi->last_coded_height && + !valid_ref_frame_size(ctx->cpi->last_coded_width, + ctx->cpi->last_coded_height, cfg->g_w, + cfg->g_h)) || (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) || - (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height)) + (ctx->cpi->initial_height && + (int)cfg->g_h > ctx->cpi->initial_height)) { force_key = 1; + } } // Prevent increasing lag_in_frames. 
This check is stricter than it needs @@ -813,6 +833,7 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, assert(codec_err != VPX_CODEC_OK); return codec_err; } + ctx->cpi->common.error.setjmp = 1; ctx->cfg = *cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); @@ -1068,6 +1089,7 @@ static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, cpi->compute_frame_low_motion_onepass = 0; cpi->rc.constrain_gf_key_freq_onepass_vbr = 0; cpi->cyclic_refresh->content_mode = 0; + cpi->disable_scene_detection_rtc_ratectrl = 1; } return VPX_CODEC_OK; } @@ -1300,6 +1322,9 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (cpi == NULL) return VPX_CODEC_INVALID_PARAM; + cpi->last_coded_width = ctx->oxcf.width; + cpi->last_coded_height = ctx->oxcf.height; + if (img != NULL) { res = validate_img(ctx, img); if (res == VPX_CODEC_OK) { @@ -1631,13 +1656,9 @@ static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, if (data) { vpx_roi_map_t *roi = (vpx_roi_map_t *)data; - - if (!vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols, - roi->delta_q, roi->delta_lf, roi->skip, - roi->ref_frame)) { - return VPX_CODEC_OK; - } - return VPX_CODEC_INVALID_PARAM; + return vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols, + roi->delta_q, roi->delta_lf, roi->skip, + roi->ref_frame); } return VPX_CODEC_INVALID_PARAM; } @@ -1675,9 +1696,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *); if (mode) { - const int res = - vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode, - (VPX_SCALING)mode->v_scaling_mode); + const int res = vp9_set_internal_size(ctx->cpi, mode->h_scaling_mode, + mode->v_scaling_mode); return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM; } return VPX_CODEC_INVALID_PARAM; @@ -1933,16 +1953,28 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, const FRAME_INFO *frame_info = &cpi->frame_info; vpx_rc_config_t ratectrl_config; vpx_codec_err_t codec_status; + memset(&ratectrl_config, 0, sizeof(ratectrl_config)); ratectrl_config.frame_width = frame_info->frame_width; ratectrl_config.frame_height = frame_info->frame_height; ratectrl_config.show_frame_count = cpi->twopass.first_pass_info.num_frames; - + ratectrl_config.max_gf_interval = oxcf->max_gf_interval; + ratectrl_config.min_gf_interval = oxcf->min_gf_interval; // TODO(angiebird): Double check whether this is the proper way to set up // target_bitrate and frame_rate. 
ratectrl_config.target_bitrate_kbps = (int)(oxcf->target_bandwidth / 1000); ratectrl_config.frame_rate_num = oxcf->g_timebase.den; ratectrl_config.frame_rate_den = oxcf->g_timebase.num; + ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; + ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; + + if (oxcf->rc_mode == VPX_VBR) { + ratectrl_config.rc_mode = VPX_RC_VBR; + } else if (oxcf->rc_mode == VPX_Q) { + ratectrl_config.rc_mode = VPX_RC_QMODE; + } else if (oxcf->rc_mode == VPX_CQ) { + ratectrl_config.rc_mode = VPX_RC_CQ; + } codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); if (codec_status != VPX_CODEC_OK) { @@ -2065,8 +2097,8 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 0, // rc_resize_allowed 0, // rc_scaled_width 0, // rc_scaled_height - 60, // rc_resize_down_thresold - 30, // rc_resize_up_thresold + 60, // rc_resize_down_thresh + 30, // rc_resize_up_thresh VPX_VBR, // rc_end_usage { NULL, 0 }, // rc_twopass_stats_in @@ -2099,7 +2131,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0 }, // ts_rate_decimator 0, // ts_periodicity { 0 }, // ts_layer_id - { 0 }, // layer_taget_bitrate + { 0 }, // layer_target_bitrate 0, // temporal_layering_mode 0, // use_vizier_rc_params { 1, 1 }, // active_wq_factor diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index bdfe21793..a242c776c 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -256,6 +256,7 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) { } while (0) static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { + vpx_codec_err_t res; ctx->last_show_frame = -1; ctx->need_resync = 1; ctx->flushed = 0; @@ -265,6 +266,8 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { ctx->pbi = vp9_decoder_create(ctx->buffer_pool); if (ctx->pbi == NULL) { + vpx_free(ctx->buffer_pool); + ctx->buffer_pool = NULL; set_error_detail(ctx, "Failed to allocate decoder"); return VPX_CODEC_MEM_ERROR; } @@ -282,7 +285,14 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) set_default_ppflags(&ctx->postproc_cfg); - return init_buffer_callbacks(ctx); + res = init_buffer_callbacks(ctx); + if (res != VPX_CODEC_OK) { + vpx_free(ctx->buffer_pool); + ctx->buffer_pool = NULL; + vp9_decoder_remove(ctx->pbi); + ctx->pbi = NULL; + } + return res; } static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, @@ -348,7 +358,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, // Initialize the decoder on the first frame. 
if (ctx->pbi == NULL) { - const vpx_codec_err_t res = init_decoder(ctx); + res = init_decoder(ctx); if (res != VPX_CODEC_OK) return res; } @@ -367,7 +377,6 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, for (i = 0; i < frame_count; ++i) { const uint8_t *data_start_copy = data_start; const uint32_t frame_size = frame_sizes[i]; - vpx_codec_err_t res; if (data_start < data || frame_size > (uint32_t)(data_end - data_start)) { set_error_detail(ctx, "Invalid frame size in index"); return VPX_CODEC_CORRUPT_FRAME; @@ -382,8 +391,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *const data_end = data + data_sz; while (data_start < data_end) { const uint32_t frame_size = (uint32_t)(data_end - data_start); - const vpx_codec_err_t res = - decode_one(ctx, &data_start, frame_size, user_priv, deadline); + res = decode_one(ctx, &data_start, frame_size, user_priv, deadline); if (res != VPX_CODEC_OK) return res; // Account for suboptimal termination by the encoder. diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 9072628f2..44790ef6a 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -40,6 +40,7 @@ VP9_CX_SRCS-yes += encoder/vp9_encodemb.h VP9_CX_SRCS-yes += encoder/vp9_encodemv.h VP9_CX_SRCS-yes += encoder/vp9_extend.h VP9_CX_SRCS-yes += encoder/vp9_firstpass.h +VP9_CX_SRCS-yes += encoder/vp9_firstpass_stats.h VP9_CX_SRCS-yes += encoder/vp9_frame_scale.c VP9_CX_SRCS-yes += encoder/vp9_job_queue.h VP9_CX_SRCS-yes += encoder/vp9_lookahead.c @@ -104,20 +105,24 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c endif VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.c VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h +VP9_CX_SRCS-yes += encoder/vp9_tpl_model.c +VP9_CX_SRCS-yes += encoder/vp9_tpl_model.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c -VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/vp9_temporal_filter_constants.h +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_temporal_filter_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/vp9_temporal_filter_constants.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c -VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm @@ -134,11 +139,12 @@ endif VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c -ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c -endif VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_error_neon.c +endif VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c @@ -156,8 +162,10 @@ VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c 
VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c -VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_constants.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter_constants.h VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/highbd_temporal_filter_sse4.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filter_neon.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.h VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.c diff --git a/vpx/exports_com b/vpx/exports_com index 2ab05099f..f0b46aa17 100644 --- a/vpx/exports_com +++ b/vpx/exports_com @@ -14,3 +14,6 @@ text vpx_img_flip text vpx_img_free text vpx_img_set_rect text vpx_img_wrap +text vpx_free_tpl_gop_stats +text vpx_read_tpl_gop_stats +text vpx_write_tpl_gop_stats diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index 670fe380e..aae321873 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -48,6 +48,8 @@ #include "../vpx_encoder.h" #include <stdarg.h> +#include "vpx_config.h" + #ifdef __cplusplus extern "C" { #endif @@ -427,6 +429,27 @@ struct vpx_internal_error_info { jmp_buf jmp; }; +#if CONFIG_DEBUG +#define CHECK_MEM_ERROR(error, lval, expr) \ + do { \ + assert((error)->setjmp); \ + (lval) = (expr); \ + if (!(lval)) \ + vpx_internal_error(error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate " #lval " at %s:%d", __FILE__, \ + __LINE__); \ + } while (0) +#else +#define CHECK_MEM_ERROR(error, lval, expr) \ + do { \ + assert((error)->setjmp); \ + (lval) = (expr); \ + if (!(lval)) \ + vpx_internal_error(error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate " #lval); \ + } while (0) +#endif + #define CLANG_ANALYZER_NORETURN #if defined(__has_feature) #if __has_feature(attribute_analyzer_noreturn) diff --git a/vpx/internal/vpx_ratectrl_rtc.h b/vpx/internal/vpx_ratectrl_rtc.h index 65398c654..01d64b14b 100644 --- a/vpx/internal/vpx_ratectrl_rtc.h +++ b/vpx/internal/vpx_ratectrl_rtc.h @@ -14,6 +14,14 @@ #include "vpx/vpx_encoder.h" namespace libvpx { + +enum class RcFrameType { kKeyFrame = 0, kInterFrame = 1 }; + +enum class FrameDropDecision { + kOk, // Frame is encoded. + kDrop, // Frame is dropped. +}; + struct VpxRateControlRtcConfig { public: VpxRateControlRtcConfig() { @@ -34,6 +42,8 @@ struct VpxRateControlRtcConfig { aq_mode = 0; layer_target_bitrate[0] = static_cast<int>(target_bandwidth); ts_rate_decimator[0] = 1; + frame_drop_thresh = 0; + is_screen = false; } int width; @@ -57,6 +67,8 @@ struct VpxRateControlRtcConfig { // vbr, cbr enum vpx_rc_mode rc_mode; int aq_mode; + int frame_drop_thresh; + bool is_screen; }; } // namespace libvpx #endif // VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_ diff --git a/vpx/src/vpx_codec.c b/vpx/src/vpx_codec.c index 114b94e19..24528d860 100644 --- a/vpx/src/vpx_codec.c +++ b/vpx/src/vpx_codec.c @@ -50,12 +50,12 @@ const char *vpx_codec_err_to_string(vpx_codec_err_t err) { return "Unrecognized error code"; } -const char *vpx_codec_error(vpx_codec_ctx_t *ctx) { +const char *vpx_codec_error(const vpx_codec_ctx_t *ctx) { return (ctx) ? 
vpx_codec_err_to_string(ctx->err) : vpx_codec_err_to_string(VPX_CODEC_INVALID_PARAM); } -const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx) { +const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx) { if (ctx && ctx->err) return ctx->priv ? ctx->priv->err_detail : ctx->err_detail; @@ -82,7 +82,7 @@ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) { } vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface) { - return (iface) ? iface->caps : 0; + return iface ? iface->caps : 0; } vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...) { diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index 846638fe5..0d6e48015 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -54,6 +54,10 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, res = ctx->iface->init(ctx, NULL); if (res) { + // IMPORTANT: ctx->priv->err_detail must be null or point to a string + // that remains valid after ctx->priv is destroyed, such as a C string + // literal. This makes it safe to call vpx_codec_error_detail() after + // vpx_codec_enc_init_ver() failed. ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; vpx_codec_destroy(ctx); } diff --git a/vpx/src/vpx_tpl.c b/vpx/src/vpx_tpl.c new file mode 100644 index 000000000..62c2a9c85 --- /dev/null +++ b/vpx/src/vpx_tpl.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> + +#include "vpx/vpx_codec.h" +#include "vpx/vpx_tpl.h" +#include "vpx_mem/vpx_mem.h" + +#define CHECK_FPRINTF_ERROR(expr) \ + do { \ + if (expr < 0) { \ + return VPX_CODEC_ERROR; \ + } \ + } while (0) + +#define CHECK_FSCANF_ERROR(expr, expected_value) \ + do { \ + if (expr != expected_value) { \ + return VPX_CODEC_ERROR; \ + } \ + } while (0) + +vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file, + const VpxTplGopStats *tpl_gop_stats) { + int i; + if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM; + CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d\n", tpl_gop_stats->size)); + + for (i = 0; i < tpl_gop_stats->size; i++) { + VpxTplFrameStats frame_stats = tpl_gop_stats->frame_stats_list[i]; + const int num_blocks = frame_stats.num_blocks; + int block; + CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d %d %d\n", frame_stats.frame_width, + frame_stats.frame_height, num_blocks)); + for (block = 0; block < num_blocks; block++) { + VpxTplBlockStats block_stats = frame_stats.block_stats_list[block]; + CHECK_FPRINTF_ERROR( + fprintf(tpl_file, + "%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64 + " %" PRId64 " %d\n", + block_stats.inter_cost, block_stats.intra_cost, + block_stats.mv_c, block_stats.mv_r, block_stats.recrf_dist, + block_stats.recrf_rate, block_stats.ref_frame_index)); + } + } + + return VPX_CODEC_OK; +} + +vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, + VpxTplGopStats *tpl_gop_stats) { + int i, frame_list_size; + if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM; + CHECK_FSCANF_ERROR(fscanf(tpl_file, "%d\n", &frame_list_size), 1); + tpl_gop_stats->size = frame_list_size; + tpl_gop_stats->frame_stats_list = (VpxTplFrameStats *)vpx_calloc( + frame_list_size, 
sizeof(tpl_gop_stats->frame_stats_list[0])); + if (tpl_gop_stats->frame_stats_list == NULL) { + return VPX_CODEC_MEM_ERROR; + } + for (i = 0; i < frame_list_size; i++) { + VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i]; + int num_blocks, width, height, block; + CHECK_FSCANF_ERROR( + fscanf(tpl_file, "%d %d %d\n", &width, &height, &num_blocks), 3); + frame_stats->num_blocks = num_blocks; + frame_stats->frame_width = width; + frame_stats->frame_height = height; + frame_stats->block_stats_list = (VpxTplBlockStats *)vpx_calloc( + num_blocks, sizeof(frame_stats->block_stats_list[0])); + if (frame_stats->block_stats_list == NULL) { + vpx_free_tpl_gop_stats(tpl_gop_stats); + return VPX_CODEC_MEM_ERROR; + } + for (block = 0; block < num_blocks; block++) { + VpxTplBlockStats *block_stats = &frame_stats->block_stats_list[block]; + CHECK_FSCANF_ERROR( + fscanf(tpl_file, + "%" SCNd64 " %" SCNd64 " %" SCNd16 " %" SCNd16 " %" SCNd64 + " %" SCNd64 " %d\n", + &block_stats->inter_cost, &block_stats->intra_cost, + &block_stats->mv_c, &block_stats->mv_r, + &block_stats->recrf_dist, &block_stats->recrf_rate, + &block_stats->ref_frame_index), + 7); + } + } + + return VPX_CODEC_OK; +} + +void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats) { + int frame; + if (tpl_gop_stats == NULL) return; + for (frame = 0; frame < tpl_gop_stats->size; frame++) { + vpx_free(tpl_gop_stats->frame_stats_list[frame].block_stats_list); + } + vpx_free(tpl_gop_stats->frame_stats_list); +} diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index e0b679fbb..2875e185e 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -166,6 +166,7 @@ enum vp8e_enc_control_id { * * \note Valid range for VP8: -16..16 * \note Valid range for VP9: -9..9 + * \note A negative value (-n) is treated as its absolute value (n) in VP9. * * Supported in codecs: VP8, VP9 */ @@ -302,7 +303,7 @@ enum vp8e_enc_control_id { * the feature is off, i.e., no golden frame boost in CBR mode and * average bitrate target is used. * - * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * For example, to allow 100% more bits, i.e., 2X, in a golden frame * than average frame, set this to 100. * * Supported in codecs: VP9 @@ -598,7 +599,7 @@ enum vp8e_enc_control_id { * the feature is off, i.e., no golden frame boost in CBR mode and * average bitrate target is used. * - * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * For example, to allow 100% more bits, i.e., 2X, in a golden frame * than average frame, set this to 100. * * Supported in codecs: VP8 diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h index b0a931e01..0d61b0738 100644 --- a/vpx/vpx_codec.h +++ b/vpx/vpx_codec.h @@ -318,19 +318,21 @@ const char *vpx_codec_err_to_string(vpx_codec_err_t err); * \param[in] ctx Pointer to this instance's context. * */ -const char *vpx_codec_error(vpx_codec_ctx_t *ctx); +const char *vpx_codec_error(const vpx_codec_ctx_t *ctx); /*!\brief Retrieve detailed error information for codec context * * Returns a human readable string providing detailed information about - * the last error. + * the last error. The returned string is only valid until the next + * vpx_codec_* function call (except vpx_codec_error and + * vpx_codec_error_detail) on the codec context. * * \param[in] ctx Pointer to this instance's context. * * \retval NULL * No detailed information is available. 
*/ -const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx); +const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx); /* REQUIRED FUNCTIONS * @@ -345,9 +347,11 @@ const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx); * \param[in] ctx Pointer to this instance's context * * \retval #VPX_CODEC_OK - * The codec algorithm initialized. - * \retval #VPX_CODEC_MEM_ERROR - * Memory allocation failed. + * The codec instance has been destroyed. + * \retval #VPX_CODEC_INVALID_PARAM + * ctx is a null pointer. + * \retval #VPX_CODEC_ERROR + * Codec context not initialized. */ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx); diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk index de86579d5..25c815ef5 100644 --- a/vpx/vpx_codec.mk +++ b/vpx/vpx_codec.mk @@ -27,6 +27,7 @@ API_DOC_SRCS-yes += vpx_encoder.h API_DOC_SRCS-yes += vpx_ext_ratectrl.h API_DOC_SRCS-yes += vpx_frame_buffer.h API_DOC_SRCS-yes += vpx_image.h +API_DOC_SRCS-yes += vpx_tpl.h API_SRCS-yes += src/vpx_decoder.c API_SRCS-yes += vpx_decoder.h @@ -36,9 +37,11 @@ API_SRCS-yes += internal/vpx_codec_internal.h API_SRCS-yes += internal/vpx_ratectrl_rtc.h API_SRCS-yes += src/vpx_codec.c API_SRCS-yes += src/vpx_image.c +API_SRCS-yes += src/vpx_tpl.c API_SRCS-yes += vpx_codec.h API_SRCS-yes += vpx_codec.mk API_SRCS-yes += vpx_frame_buffer.h API_SRCS-yes += vpx_image.h API_SRCS-yes += vpx_integer.h API_SRCS-yes += vpx_ext_ratectrl.h +API_SRCS-yes += vpx_tpl.h diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h index 39e5f585f..99dd8cf69 100644 --- a/vpx/vpx_decoder.h +++ b/vpx/vpx_decoder.h @@ -127,7 +127,7 @@ typedef struct vpx_codec_dec_cfg { * \param[in] ver ABI version number. Must be set to * VPX_DECODER_ABI_VERSION * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. + * The decoder algorithm has been initialized. * \retval #VPX_CODEC_MEM_ERROR * Memory allocation failed. */ diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index efaf5ef36..c45d1a2ba 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -31,6 +31,7 @@ extern "C" { #include "./vpx_codec.h" #include "./vpx_ext_ratectrl.h" +#include "./vpx_tpl.h" /*! Temporal Scalability: Maximum length of the sequence defining frame * layer membership @@ -57,9 +58,9 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_ENCODER_ABI_VERSION \ - (15 + VPX_CODEC_ABI_VERSION + \ - VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/ +#define VPX_ENCODER_ABI_VERSION \ + (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \ + VPX_TPL_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -858,7 +859,7 @@ typedef struct vpx_svc_parameters { /*!\brief Initialize an encoder instance * - * Initializes a encoder context using the given interface. Applications + * Initializes an encoder context using the given interface. Applications * should call the vpx_codec_enc_init convenience macro instead of this * function directly, to ensure that the ABI version number parameter * is properly initialized. @@ -867,6 +868,9 @@ typedef struct vpx_svc_parameters { * is not thread safe and should be guarded with a lock if being used * in a multithreaded context. * + * If vpx_codec_enc_init_ver() fails, it is not necessary to call + * vpx_codec_destroy() on the encoder context. + * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. * \param[in] cfg Configuration to use, if known. May be NULL. 
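The note just added to vpx_codec_enc_init_ver() tightens the init contract: after a failed init there is nothing to destroy, and the error strings remain readable. A minimal C sketch of the resulting call pattern (the config values are placeholders, not recommendations):

#include <stdio.h>

#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

static int open_vp9_encoder(vpx_codec_ctx_t *ctx, unsigned int width,
                            unsigned int height) {
  vpx_codec_iface_t *const iface = vpx_codec_vp9_cx();
  vpx_codec_enc_cfg_t cfg;
  if (vpx_codec_enc_config_default(iface, &cfg, 0) != VPX_CODEC_OK) return -1;
  cfg.g_w = width;
  cfg.g_h = height;
  if (vpx_codec_enc_init(ctx, iface, &cfg, 0) != VPX_CODEC_OK) {
    /* Per the documented contract, a failed init needs no
     * vpx_codec_destroy(), and the error strings are safe to read. */
    const char *detail = vpx_codec_error_detail(ctx);
    fprintf(stderr, "init failed: %s\n",
            detail ? detail : vpx_codec_error(ctx));
    return -1;
  }
  return 0; /* success: the caller must vpx_codec_destroy(ctx) when done */
}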
@@ -906,7 +910,7 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, * \param[in] ver ABI version number. Must be set to * VPX_ENCODER_ABI_VERSION * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. + * The encoder algorithm has been initialized. * \retval #VPX_CODEC_MEM_ERROR * Memory allocation failed. */ diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 3c5fc8cfc..46d290dff 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -16,6 +16,7 @@ extern "C" { #endif #include "./vpx_integer.h" +#include "./vpx_tpl.h" /*!\brief Current ABI version number * @@ -25,7 +26,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. */ -#define VPX_EXT_RATECTRL_ABI_VERSION (6) +#define VPX_EXT_RATECTRL_ABI_VERSION (7) /*!\brief The control type of the inference API. * In VPX_RC_QP mode, the external rate control model determines the @@ -47,6 +48,14 @@ typedef enum vpx_rc_type { VPX_RC_GOP_QP_RDMULT = VPX_RC_QP | VPX_RC_GOP | VPX_RC_RDMULT } vpx_rc_type_t; +/*!\brief The rate control mode for the external rate control model. + */ +typedef enum vpx_ext_rc_mode { + VPX_RC_QMODE = 0, + VPX_RC_VBR = 1, + VPX_RC_CQ = 2, +} vpx_ext_rc_mode_t; + /*!\brief Abstract rate control model handler * * The encoder will receive the model handler from create_model() defined in @@ -271,6 +280,10 @@ typedef struct vpx_rc_frame_stats { * number of frames whose stats are accumulated. */ double count; + /*! + * Number of new mv in a frame. + */ + double new_mv_count; } vpx_rc_frame_stats_t; /*!\brief Collection of first pass frame stats @@ -294,12 +307,21 @@ typedef struct vpx_rc_config { int frame_width; /**< frame width */ int frame_height; /**< frame height */ int show_frame_count; /**< number of visible frames in the video */ + int max_gf_interval; /**< max GOP size in number of show frames */ + int min_gf_interval; /**< min GOP size in number of show frames */ /*! * Target bitrate in kilobytes per second */ int target_bitrate_kbps; int frame_rate_num; /**< numerator of frame rate */ int frame_rate_den; /**< denominator of frame rate */ + /*! + * The following fields are only for external rate control models that support + * different rate control modes. + */ + vpx_ext_rc_mode_t rc_mode; /**< Q mode or VBR mode */ + int overshoot_percent; /**< for VBR mode only */ + int undershoot_percent; /**< for VBR mode only */ } vpx_rc_config_t; /*!\brief Information passed to the external rate control model to @@ -385,13 +407,13 @@ typedef struct vpx_rc_gop_decision { * This callback is invoked by the encoder to create an external rate control * model. 
* - * \param[in] priv Callback's private data - * \param[in] ratectrl_config Pointer to vpx_rc_config_t - * \param[out] rate_ctrl_model_pt Pointer to vpx_rc_model_t + * \param[in] priv Callback's private data + * \param[in] ratectrl_config Pointer to vpx_rc_config_t + * \param[out] rate_ctrl_model_ptr Pointer to vpx_rc_model_t */ typedef vpx_rc_status_t (*vpx_rc_create_model_cb_fn_t)( void *priv, const vpx_rc_config_t *ratectrl_config, - vpx_rc_model_t *rate_ctrl_model_pt); + vpx_rc_model_t *rate_ctrl_model_ptr); /*!\brief Send first pass stats to the external rate control model callback * prototype @@ -406,6 +428,18 @@ typedef vpx_rc_status_t (*vpx_rc_send_firstpass_stats_cb_fn_t)( vpx_rc_model_t rate_ctrl_model, const vpx_rc_firstpass_stats_t *first_pass_stats); +/*!\brief Send TPL stats for the current GOP to the external rate control model + * callback prototype + * + * This callback is invoked by the encoder to send TPL stats for the GOP to the + * external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] tpl_gop_stats TPL stats for current GOP + */ +typedef vpx_rc_status_t (*vpx_rc_send_tpl_gop_stats_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, const VpxTplGopStats *tpl_gop_stats); + /*!\brief Receive encode frame decision callback prototype * * This callback is invoked by the encoder to receive encode frame decision from @@ -488,6 +522,10 @@ typedef struct vpx_rc_funcs { */ vpx_rc_send_firstpass_stats_cb_fn_t send_firstpass_stats; /*! + * Send TPL stats for current GOP to the external rate control model. + */ + vpx_rc_send_tpl_gop_stats_cb_fn_t send_tpl_gop_stats; + /*! * Get encodeframe decision from the external rate control model. */ vpx_rc_get_encodeframe_decision_cb_fn_t get_encodeframe_decision; diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h new file mode 100644 index 000000000..a250aada6 --- /dev/null +++ b/vpx/vpx_tpl.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\file + * \brief Describes the TPL stats descriptor and associated operations + * + */ +#ifndef VPX_VPX_VPX_TPL_H_ +#define VPX_VPX_VPX_TPL_H_ + +#include <stdio.h> + +#include "./vpx_integer.h" +#include "./vpx_codec.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. 
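Taken together, the vpx_ext_ratectrl.h changes above (the vpx_ext_rc_mode_t enum, the extra vpx_rc_config_t fields, and the send_tpl_gop_stats hook in vpx_rc_funcs) are easiest to read from the model's side. A hedged sketch follows: my_model_t and the my_* functions are hypothetical, and VPX_RC_OK / VPX_RC_ERROR are assumed to be the usual vpx_rc_status_t values; only the vpx_- and VPX_-prefixed names come from the header.

    #include <stdlib.h>
    #include "vpx/vpx_ext_ratectrl.h"

    /* Hypothetical model state. */
    typedef struct my_model {
      vpx_ext_rc_mode_t mode;
      int overshoot_percent;
      int undershoot_percent;
    } my_model_t;

    static vpx_rc_status_t my_create_model(
        void *priv, const vpx_rc_config_t *cfg,
        vpx_rc_model_t *rate_ctrl_model_ptr) {
      my_model_t *m = (my_model_t *)calloc(1, sizeof(*m));
      (void)priv;
      if (m == NULL) return VPX_RC_ERROR;
      m->mode = cfg->rc_mode;
      if (cfg->rc_mode == VPX_RC_VBR) {
        /* Per the header comments, these two fields are VBR-only. */
        m->overshoot_percent = cfg->overshoot_percent;
        m->undershoot_percent = cfg->undershoot_percent;
      }
      *rate_ctrl_model_ptr = m;
      return VPX_RC_OK;
    }

    static vpx_rc_status_t my_send_tpl_gop_stats(
        vpx_rc_model_t rate_ctrl_model, const VpxTplGopStats *tpl_gop_stats) {
      /* A real model would fold per-block intra/inter costs into its rate
       * estimates; this stub only validates the GOP size. */
      (void)rate_ctrl_model;
      return tpl_gop_stats->size > 0 ? VPX_RC_OK : VPX_RC_ERROR;
    }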
Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_TPL_ABI_VERSION (2) /**<\hideinitializer*/ + +/*!\brief Temporal dependency model stats for each block before propagation */ +typedef struct VpxTplBlockStats { + int16_t row; /**< Pixel row of the top left corner */ + int16_t col; /**< Pixel col of the top left corner */ + int64_t intra_cost; /**< Intra cost */ + int64_t inter_cost; /**< Inter cost */ + int16_t mv_r; /**< Motion vector row */ + int16_t mv_c; /**< Motion vector col */ + int64_t recrf_rate; /**< Rate from reconstructed ref frame */ + int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ + int ref_frame_index; /**< Ref frame index in the ref frame buffer */ +} VpxTplBlockStats; + +/*!\brief Temporal dependency model stats for each frame before propagation */ +typedef struct VpxTplFrameStats { + int frame_width; /**< Frame width */ + int frame_height; /**< Frame height */ + int num_blocks; /**< Number of blocks. Size of block_stats_list */ + VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ +} VpxTplFrameStats; + +/*!\brief Temporal dependency model stats for each GOP before propagation */ +typedef struct VpxTplGopStats { + int size; /**< GOP size, also the size of frame_stats_list. */ + VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */ +} VpxTplGopStats; + +/*!\brief Write VpxTplGopStats to file + * + * Accepts an opened file handle and writes \p tpl_gop_stats. + * + * \param[in] tpl_file A FILE pointer that's already been opened. + * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + * + * \return VPX_CODEC_OK if TPL stats are successfully written. + */ +vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file, + const VpxTplGopStats *tpl_gop_stats); + +/*!\brief Read VpxTplGopStats from file + * + * Accepts an opened file handle and reads TPL stats and stores them into + * \p tpl_gop_stats. Allocates memory for TPL stats. + * + * \param[in] tpl_file A FILE pointer that's already been opened. + * \param[out] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + * + * \return VPX_CODEC_OK if TPL stats are successfully read from file. + */ +vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, + VpxTplGopStats *tpl_gop_stats); + +/*!\brief Free the memory allocated for VpxTplGopStats + * + * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + */ +void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_VPX_TPL_H_ diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 8e57bdaa5..1b17a326b 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -46,96 +46,93 @@ uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) { // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. 
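Returning briefly to the new vpx/vpx_tpl.h above: its three file-I/O entry points compose into a simple round trip, which is worth sketching because the allocation contract is split across them (vpx_read_tpl_gop_stats() allocates, vpx_free_tpl_gop_stats() releases). Error handling is trimmed and roundtrip_tpl() is a hypothetical helper:

    #include <stdio.h>
    #include "vpx/vpx_tpl.h"

    /* Hypothetical: write encoder-produced TPL stats to disk, read them
     * back, then release the memory vpx_read_tpl_gop_stats() allocated. */
    static vpx_codec_err_t roundtrip_tpl(const VpxTplGopStats *stats,
                                         const char *path) {
      VpxTplGopStats read_back;
      vpx_codec_err_t err;
      FILE *f = fopen(path, "w");
      if (f == NULL) return VPX_CODEC_ERROR;
      err = vpx_write_tpl_gop_stats(f, stats);
      fclose(f);
      if (err != VPX_CODEC_OK) return err;

      f = fopen(path, "r");
      if (f == NULL) return VPX_CODEC_ERROR;
      err = vpx_read_tpl_gop_stats(f, &read_back);
      fclose(f);
      if (err == VPX_CODEC_OK) vpx_free_tpl_gop_stats(&read_back);
      return err;
    }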
+// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] int vpx_satd_neon(const tran_low_t *coeff, int length) { - const int16x4_t zero = vdup_n_s16(0); - int32x4_t accum = vdupq_n_s32(0); + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; do { - const int16x8_t src0 = load_tran_low_to_s16q(coeff); - const int16x8_t src8 = load_tran_low_to_s16q(coeff + 8); - accum = vabal_s16(accum, vget_low_s16(src0), zero); - accum = vabal_s16(accum, vget_high_s16(src0), zero); - accum = vabal_s16(accum, vget_low_s16(src8), zero); - accum = vabal_s16(accum, vget_high_s16(src8), zero); + int16x8_t abs0, abs1; + const int16x8_t s0 = load_tran_low_to_s16q(coeff); + const int16x8_t s1 = load_tran_low_to_s16q(coeff + 8); + + abs0 = vabsq_s16(s0); + sum_s32[0] = vpadalq_s16(sum_s32[0], abs0); + abs1 = vabsq_s16(s1); + sum_s32[1] = vpadalq_s16(sum_s32[1], abs1); + length -= 16; coeff += 16; } while (length != 0); - { - // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] - const int64x2_t s0 = vpaddlq_s32(accum); // cascading summation of 'accum'. - const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), - vreinterpret_s32_s64(vget_high_s64(s0))); - const int satd = vget_lane_s32(s1, 0); - return satd; - } + return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1])); } void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height) { int i; - uint16x8_t vec_sum_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_hi = vdupq_n_u16(0); - const int shift_factor = ((height >> 5) + 3) * -1; - const int16x8_t vec_shift = vdupq_n_s16(shift_factor); + uint8x16_t r0, r1, r2, r3; + uint16x8_t sum_lo[2], sum_hi[2]; + uint16x8_t tmp_lo[2], tmp_hi[2]; + int16x8_t avg_lo, avg_hi; - for (i = 0; i < height; i += 8) { - const uint8x16_t vec_row1 = vld1q_u8(ref); - const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride); - const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2); - const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3); - const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4); - const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5); - const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6); - const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7); + const int norm_factor = (height >> 5) + 3; + const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1)); + assert(height >= 4 && height % 4 == 0); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2)); + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3)); + sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4)); + ref += 4 * ref_stride; - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5)); + for (i = 4; i < height; i += 4) { + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = 
vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6)); + tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7)); + sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]); + sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]); + sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]); + sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8)); - - ref += ref_stride * 8; + ref += 4 * ref_stride; } - vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift); - vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift); + sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); + sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); + + avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); + avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo)); - hbuf += 8; - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi)); + vst1q_s16(hbuf, avg_lo); + vst1q_s16(hbuf + 8, avg_hi); } int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { + uint16x8_t sum; int i; - uint16x8_t vec_sum = vdupq_n_u16(0); - for (i = 0; i < width; i += 16) { - const uint8x16_t vec_row = vld1q_u8(ref); - vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row)); - vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row)); - ref += 16; + assert(width >= 16 && width % 16 == 0); + + sum = vpaddlq_u8(vld1q_u8(ref)); + for (i = 16; i < width; i += 16) { + sum = vpadalq_u8(sum, vld1q_u8(ref + i)); } - return (int16_t)horizontal_add_uint16x8(vec_sum); + return (int16_t)horizontal_add_uint16x8(sum); } // ref, src = [0, 510] - max diff = 16-bits @@ -214,11 +211,16 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max); const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min); - // Split to D and start doing pairwise. +#if VPX_ARCH_AARCH64 + *min = *max = 0; // Clear high bits + *((uint8_t *)max) = vmaxvq_u8(ab07_max); + *((uint8_t *)min) = vminvq_u8(ab07_min); +#else + // Split into 64-bit vectors and execute pairwise min/max. uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max)); uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min)); - // Enough runs of vpmax/min propogate the max/min values to every position. + // Enough runs of vpmax/min propagate the max/min values to every position. ab_max = vpmax_u8(ab_max, ab_max); ab_min = vpmin_u8(ab_min, ab_min); @@ -232,4 +234,5 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, // Store directly to avoid costly neon->gpr transfer. vst1_lane_u8((uint8_t *)max, ab_max, 0); vst1_lane_u8((uint8_t *)min, ab_min, 0); +#endif } diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index a458ecaa4..fde71ff30 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -28,6 +28,124 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { #else +// Main body of fdct16x16. 
+static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/, + int16x8_t *out /*[16]*/) { + int16x8_t s[8]; + int16x8_t x[4]; + int16x8_t step[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]); + + // Stage 3 + x[0] = vaddq_s16(s[4], s[5]); + x[1] = vsubq_s16(s[4], s[5]); + x[2] = vsubq_s16(s[7], s[6]); + x[3] = vaddq_s16(s[7], s[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. 
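+  // Concretely: what fwd_txfm.c calls step1[2] is in[10] here and step1[5]
+  // is in[13], which is why the first butterfly below reads in[13]/in[10]
+  // and the second reads in[12]/in[11].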
+ + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]); + + // step 3 + s[0] = vaddq_s16(in[8], s[3]); + s[1] = vaddq_s16(in[9], s[2]); + x[0] = vsubq_s16(in[9], s[2]); + x[1] = vsubq_s16(in[8], s[3]); + x[2] = vsubq_s16(in[15], s[4]); + x[3] = vsubq_s16(in[14], s[5]); + s[6] = vaddq_s16(in[14], s[5]); + s[7] = vaddq_s16(in[15], s[4]); + + // step 4 + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]); + + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]); + + // step 5 + step[0] = vaddq_s16(s[0], s[1]); + step[1] = vsubq_s16(s[0], s[1]); + step[2] = vaddq_s16(x[1], s[2]); + step[3] = vsubq_s16(x[1], s[2]); + step[4] = vsubq_s16(x[2], s[5]); + step[5] = vaddq_s16(x[2], s[5]); + step[6] = vsubq_s16(s[7], s[6]); + step[7] = vaddq_s16(s[7], s[6]); + + // step 6 + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9], + &out[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1], + &out[15]); + + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13], + &out[3]); + + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5], + &out[11]); +} + void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[16]; int16x8_t temp1[16]; @@ -47,8 +165,8 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose top left and top right quarters into one contiguous location to // process to the top half. - transpose_s16_8x8_new(&temp0[0], &temp2[0]); - transpose_s16_8x8_new(&temp1[0], &temp2[8]); + transpose_s16_8x8q(&temp0[0], &temp2[0]); + transpose_s16_8x8q(&temp1[0], &temp2[8]); partial_round_shift(temp2); cross_input(temp2, temp3); vpx_fdct8x16_body(temp3, temp2); @@ -62,7 +180,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. 
- transpose_s16_8x8_new(&temp0[8], &temp1[0]); + transpose_s16_8x8q(&temp0[8], &temp1[0]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); @@ -79,6 +197,194 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { #if CONFIG_VP9_HIGHBITDEPTH +// Main body of fdct8x16 column +static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + int32x4_t sl[8]; + int32x4_t sr[8]; + int32x4_t xl[4]; + int32x4_t xr[4]; + int32x4_t inl[8]; + int32x4_t inr[8]; + int32x4_t stepl[8]; + int32x4_t stepr[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + sl[0] = vaddq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sr[1] = vaddq_s32(right[1], right[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sr[2] = vaddq_s32(right[2], right[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sr[3] = vaddq_s32(right[3], right[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sr[5] = vsubq_s32(right[2], right[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sr[6] = vsubq_s32(right[1], right[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[7] = vsubq_s32(right[0], right[7]); + + // Copy values 8-15 as we're storing in-place + inl[0] = left[8]; + inr[0] = right[8]; + inl[1] = left[9]; + inr[1] = right[9]; + inl[2] = left[10]; + inr[2] = right[10]; + inl[3] = left[11]; + inr[3] = right[11]; + inl[4] = left[12]; + inr[4] = right[12]; + inl[5] = left[13]; + inr[5] = right[13]; + inl[6] = left[14]; + inr[6] = right[14]; + inl[7] = left[15]; + inr[7] = right[15]; + + // fdct4(step, step); + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[8], &right[8]); + + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[4], &right[4], + &left[12], &right[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6], + &sr[6], &sl[5], &sr[5]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], sl[5]); + xr[0] = vaddq_s32(sr[4], sr[5]); + xl[1] = vsubq_s32(sl[4], sl[5]); + xr[1] = vsubq_s32(sr[4], sr[5]); + xl[2] = vsubq_s32(sl[7], sl[6]); + xr[2] = vsubq_s32(sr[7], sr[6]); + xl[3] = vaddq_s32(sl[7], sl[6]); + xr[3] = vaddq_s32(sr[7], sr[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[2], &right[2], + &left[14], &right[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + 
butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[10], &right[10], + &left[6], &right[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. + + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64, + &sl[5], &sr[5], &sl[2], &sr[2]); + butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64, + &sl[4], &sr[4], &sl[3], &sr[3]); + + // step 3 + sl[0] = vaddq_s32(inl[0], sl[3]); + sr[0] = vaddq_s32(inr[0], sr[3]); + sl[1] = vaddq_s32(inl[1], sl[2]); + sr[1] = vaddq_s32(inr[1], sr[2]); + xl[0] = vsubq_s32(inl[1], sl[2]); + xr[0] = vsubq_s32(inr[1], sr[2]); + xl[1] = vsubq_s32(inl[0], sl[3]); + xr[1] = vsubq_s32(inr[0], sr[3]); + xl[2] = vsubq_s32(inl[7], sl[4]); + xr[2] = vsubq_s32(inr[7], sr[4]); + xl[3] = vsubq_s32(inl[6], sl[5]); + xr[3] = vsubq_s32(inr[6], sr[5]); + sl[6] = vaddq_s32(inl[6], sl[5]); + sr[6] = vaddq_s32(inr[6], sr[5]); + sl[7] = vaddq_s32(inl[7], sl[4]); + sr[7] = vaddq_s32(inr[7], sr[4]); + + // step 4 + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64, + cospi_24_64, &sl[6], &sr[6], &sl[1], + &sr[1]); + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64, + cospi_8_64, &sl[2], &sr[2], &sl[5], + &sr[5]); + + // step 5 + stepl[0] = vaddq_s32(sl[0], sl[1]); + stepr[0] = vaddq_s32(sr[0], sr[1]); + stepl[1] = vsubq_s32(sl[0], sl[1]); + stepr[1] = vsubq_s32(sr[0], sr[1]); + stepl[2] = vaddq_s32(xl[1], sl[2]); + stepr[2] = vaddq_s32(xr[1], sr[2]); + stepl[3] = vsubq_s32(xl[1], sl[2]); + stepr[3] = vsubq_s32(xr[1], sr[2]); + stepl[4] = vsubq_s32(xl[2], sl[5]); + stepr[4] = vsubq_s32(xr[2], sr[5]); + stepl[5] = vaddq_s32(xl[2], sl[5]); + stepr[5] = vaddq_s32(xr[2], sr[5]); + stepl[6] = vsubq_s32(sl[7], sl[6]); + stepr[6] = vsubq_s32(sr[7], sr[6]); + stepl[7] = vaddq_s32(sl[7], sl[6]); + stepr[7] = vaddq_s32(sr[7], sr[6]); + + // step 6 + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * 
cospi_26_64) + butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); +} + void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[16]; diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h index 43d820b6b..cd58675ca 100644 --- a/vpx_dsp/arm/fdct16x16_neon.h +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -159,124 +159,6 @@ static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); } -// Main body of fdct16x16. -static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/, - int16x8_t *out /*[16]*/) { - int16x8_t s[8]; - int16x8_t x[4]; - int16x8_t step[8]; - - // stage 1 - // From fwd_txfm.c: Work on the first eight values; fdct8(input, - // even_results);" - s[0] = vaddq_s16(in[0], in[7]); - s[1] = vaddq_s16(in[1], in[6]); - s[2] = vaddq_s16(in[2], in[5]); - s[3] = vaddq_s16(in[3], in[4]); - s[4] = vsubq_s16(in[3], in[4]); - s[5] = vsubq_s16(in[2], in[5]); - s[6] = vsubq_s16(in[1], in[6]); - s[7] = vsubq_s16(in[0], in[7]); - - // fdct4(step, step); - x[0] = vaddq_s16(s[0], s[3]); - x[1] = vaddq_s16(s[1], s[2]); - x[2] = vsubq_s16(s[1], s[2]); - x[3] = vsubq_s16(s[0], s[3]); - - // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) - // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], - &out[8]); - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); - // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]); - - // Stage 2 - // Re-using source s5/s6 - // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) - // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]); - - // Stage 3 - x[0] = vaddq_s16(s[4], s[5]); - x[1] = vsubq_s16(s[4], s[5]); - x[2] = vsubq_s16(s[7], s[6]); - x[3] = vaddq_s16(s[7], s[6]); - - // Stage 4 - // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) - butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]); - // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) - butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]); - - // step 2 - // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" - // That file distinguished between "in_high" and "step1" but the only - // difference is that "in_high" is the first 8 values and "step 1" is the - // second. Here, since they are all in one array, "step1" values are += 8. 
- - // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) - // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) - // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) - // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]); - butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]); - - // step 3 - s[0] = vaddq_s16(in[8], s[3]); - s[1] = vaddq_s16(in[9], s[2]); - x[0] = vsubq_s16(in[9], s[2]); - x[1] = vsubq_s16(in[8], s[3]); - x[2] = vsubq_s16(in[15], s[4]); - x[3] = vsubq_s16(in[14], s[5]); - s[6] = vaddq_s16(in[14], s[5]); - s[7] = vaddq_s16(in[15], s[4]); - - // step 4 - // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * - // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] - // * cospi_8_64) - butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]); - - // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * - // cospi_24_64) - butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]); - - // step 5 - step[0] = vaddq_s16(s[0], s[1]); - step[1] = vsubq_s16(s[0], s[1]); - step[2] = vaddq_s16(x[1], s[2]); - step[3] = vsubq_s16(x[1], s[2]); - step[4] = vsubq_s16(x[2], s[5]); - step[5] = vaddq_s16(x[2], s[5]); - step[6] = vsubq_s16(s[7], s[6]); - step[7] = vaddq_s16(s[7], s[6]); - - // step 6 - // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) - // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) - butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9], - &out[7]); - // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) - // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) - butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1], - &out[15]); - - // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) - // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) - butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13], - &out[3]); - - // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) - // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) - butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5], - &out[11]); -} - #if CONFIG_VP9_HIGHBITDEPTH static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/, @@ -431,194 +313,6 @@ static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) { vst1q_s32(a, b[15]); } -// Main body of fdct8x16 column -static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/, - int32x4_t *right /* [16] */) { - int32x4_t sl[8]; - int32x4_t sr[8]; - int32x4_t xl[4]; - int32x4_t xr[4]; - int32x4_t inl[8]; - int32x4_t inr[8]; - int32x4_t stepl[8]; - int32x4_t stepr[8]; - - // stage 1 - // From fwd_txfm.c: Work on the first eight values; fdct8(input, - // even_results);" - sl[0] = vaddq_s32(left[0], left[7]); - sr[0] = vaddq_s32(right[0], right[7]); - sl[1] = vaddq_s32(left[1], left[6]); - sr[1] = vaddq_s32(right[1], right[6]); - sl[2] = vaddq_s32(left[2], left[5]); - sr[2] = vaddq_s32(right[2], right[5]); - sl[3] = vaddq_s32(left[3], left[4]); - sr[3] = vaddq_s32(right[3], right[4]); - sl[4] = vsubq_s32(left[3], left[4]); - sr[4] = vsubq_s32(right[3], right[4]); - sl[5] = 
vsubq_s32(left[2], left[5]); - sr[5] = vsubq_s32(right[2], right[5]); - sl[6] = vsubq_s32(left[1], left[6]); - sr[6] = vsubq_s32(right[1], right[6]); - sl[7] = vsubq_s32(left[0], left[7]); - sr[7] = vsubq_s32(right[0], right[7]); - - // Copy values 8-15 as we're storing in-place - inl[0] = left[8]; - inr[0] = right[8]; - inl[1] = left[9]; - inr[1] = right[9]; - inl[2] = left[10]; - inr[2] = right[10]; - inl[3] = left[11]; - inr[3] = right[11]; - inl[4] = left[12]; - inr[4] = right[12]; - inl[5] = left[13]; - inr[5] = right[13]; - inl[6] = left[14]; - inr[6] = right[14]; - inl[7] = left[15]; - inr[7] = right[15]; - - // fdct4(step, step); - xl[0] = vaddq_s32(sl[0], sl[3]); - xr[0] = vaddq_s32(sr[0], sr[3]); - xl[1] = vaddq_s32(sl[1], sl[2]); - xr[1] = vaddq_s32(sr[1], sr[2]); - xl[2] = vsubq_s32(sl[1], sl[2]); - xr[2] = vsubq_s32(sr[1], sr[2]); - xl[3] = vsubq_s32(sl[0], sl[3]); - xr[3] = vsubq_s32(sr[0], sr[3]); - - // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) - // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, - &left[0], &right[0], &left[8], &right[8]); - - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); - // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, - cospi_24_64, &left[4], &right[4], - &left[12], &right[12]); - - // Stage 2 - // Re-using source s5/s6 - // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) - // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6], - &sr[6], &sl[5], &sr[5]); - - // Stage 3 - xl[0] = vaddq_s32(sl[4], sl[5]); - xr[0] = vaddq_s32(sr[4], sr[5]); - xl[1] = vsubq_s32(sl[4], sl[5]); - xr[1] = vsubq_s32(sr[4], sr[5]); - xl[2] = vsubq_s32(sl[7], sl[6]); - xr[2] = vsubq_s32(sr[7], sr[6]); - xl[3] = vaddq_s32(sl[7], sl[6]); - xr[3] = vaddq_s32(sr[7], sr[6]); - - // Stage 4 - // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) - butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, - cospi_28_64, &left[2], &right[2], - &left[14], &right[14]); - // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) - butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, - cospi_12_64, &left[10], &right[10], - &left[6], &right[6]); - - // step 2 - // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" - // That file distinguished between "in_high" and "step1" but the only - // difference is that "in_high" is the first 8 values and "step 1" is the - // second. Here, since they are all in one array, "step1" values are += 8. 
- - // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) - // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) - // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) - // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64, - &sl[5], &sr[5], &sl[2], &sr[2]); - butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64, - &sl[4], &sr[4], &sl[3], &sr[3]); - - // step 3 - sl[0] = vaddq_s32(inl[0], sl[3]); - sr[0] = vaddq_s32(inr[0], sr[3]); - sl[1] = vaddq_s32(inl[1], sl[2]); - sr[1] = vaddq_s32(inr[1], sr[2]); - xl[0] = vsubq_s32(inl[1], sl[2]); - xr[0] = vsubq_s32(inr[1], sr[2]); - xl[1] = vsubq_s32(inl[0], sl[3]); - xr[1] = vsubq_s32(inr[0], sr[3]); - xl[2] = vsubq_s32(inl[7], sl[4]); - xr[2] = vsubq_s32(inr[7], sr[4]); - xl[3] = vsubq_s32(inl[6], sl[5]); - xr[3] = vsubq_s32(inr[6], sr[5]); - sl[6] = vaddq_s32(inl[6], sl[5]); - sr[6] = vaddq_s32(inr[6], sr[5]); - sl[7] = vaddq_s32(inl[7], sl[4]); - sr[7] = vaddq_s32(inr[7], sr[4]); - - // step 4 - // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * - // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] - // * cospi_8_64) - butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64, - cospi_24_64, &sl[6], &sr[6], &sl[1], - &sr[1]); - // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * - // cospi_24_64) - butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64, - cospi_8_64, &sl[2], &sr[2], &sl[5], - &sr[5]); - - // step 5 - stepl[0] = vaddq_s32(sl[0], sl[1]); - stepr[0] = vaddq_s32(sr[0], sr[1]); - stepl[1] = vsubq_s32(sl[0], sl[1]); - stepr[1] = vsubq_s32(sr[0], sr[1]); - stepl[2] = vaddq_s32(xl[1], sl[2]); - stepr[2] = vaddq_s32(xr[1], sr[2]); - stepl[3] = vsubq_s32(xl[1], sl[2]); - stepr[3] = vsubq_s32(xr[1], sr[2]); - stepl[4] = vsubq_s32(xl[2], sl[5]); - stepr[4] = vsubq_s32(xr[2], sr[5]); - stepl[5] = vaddq_s32(xl[2], sl[5]); - stepr[5] = vaddq_s32(xr[2], sr[5]); - stepl[6] = vsubq_s32(sl[7], sl[6]); - stepr[6] = vsubq_s32(sr[7], sr[6]); - stepl[7] = vaddq_s32(sl[7], sl[6]); - stepr[7] = vaddq_s32(sr[7], sr[6]); - - // step 6 - // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) - // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) - butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1], - cospi_18_64, cospi_14_64, &left[9], - &right[9], &left[7], &right[7]); - // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) - // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) - butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0], - cospi_2_64, cospi_30_64, &left[1], - &right[1], &left[15], &right[15]); - // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) - // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) - butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3], - cospi_26_64, cospi_6_64, &left[13], - &right[13], &left[3], &right[3]); - // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) - // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) - butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2], - cospi_10_64, cospi_22_64, &left[5], - &right[5], 
&left[11], &right[11]); -} - #endif // CONFIG_VP9_HIGHBITDEPTH #endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c index d6818d2ec..a91730ce8 100644 --- a/vpx_dsp/arm/fdct32x32_neon.c +++ b/vpx_dsp/arm/fdct32x32_neon.c @@ -60,10 +60,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_s16_8x8_new(&temp1[0], &temp0[0]); - transpose_s16_8x8_new(&temp2[0], &temp0[8]); - transpose_s16_8x8_new(&temp3[0], &temp0[16]); - transpose_s16_8x8_new(&temp4[0], &temp0[24]); + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -78,10 +78,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output, temp5); // Second row of 8x32. - transpose_s16_8x8_new(&temp1[8], &temp0[0]); - transpose_s16_8x8_new(&temp2[8], &temp0[8]); - transpose_s16_8x8_new(&temp3[8], &temp0[16]); - transpose_s16_8x8_new(&temp4[8], &temp0[24]); + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -96,10 +96,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_s16_8x8_new(&temp1[16], &temp0[0]); - transpose_s16_8x8_new(&temp2[16], &temp0[8]); - transpose_s16_8x8_new(&temp3[16], &temp0[16]); - transpose_s16_8x8_new(&temp4[16], &temp0[24]); + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -114,10 +114,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_s16_8x8_new(&temp1[24], &temp0[0]); - transpose_s16_8x8_new(&temp2[24], &temp0[8]); - transpose_s16_8x8_new(&temp3[24], &temp0[16]); - transpose_s16_8x8_new(&temp4[24], &temp0[24]); + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -159,10 +159,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_s16_8x8_new(&temp1[0], &temp0[0]); - transpose_s16_8x8_new(&temp2[0], &temp0[8]); - transpose_s16_8x8_new(&temp3[0], &temp0[16]); - transpose_s16_8x8_new(&temp4[0], &temp0[24]); + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -177,10 +177,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output, temp5); // Second row of 8x32. 
- transpose_s16_8x8_new(&temp1[8], &temp0[0]); - transpose_s16_8x8_new(&temp2[8], &temp0[8]); - transpose_s16_8x8_new(&temp3[8], &temp0[16]); - transpose_s16_8x8_new(&temp4[8], &temp0[24]); + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -195,10 +195,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_s16_8x8_new(&temp1[16], &temp0[0]); - transpose_s16_8x8_new(&temp2[16], &temp0[8]); - transpose_s16_8x8_new(&temp3[16], &temp0[16]); - transpose_s16_8x8_new(&temp4[16], &temp0[24]); + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -213,10 +213,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_s16_8x8_new(&temp1[24], &temp0[0]); - transpose_s16_8x8_new(&temp2[24], &temp0[8]); - transpose_s16_8x8_new(&temp3[24], &temp0[16]); - transpose_s16_8x8_new(&temp4[24], &temp0[24]); + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); diff --git a/vpx_dsp/arm/fdct4x4_neon.c b/vpx_dsp/arm/fdct4x4_neon.c index 3b9196fae..4bc968ecb 100644 --- a/vpx_dsp/arm/fdct4x4_neon.c +++ b/vpx_dsp/arm/fdct4x4_neon.c @@ -52,7 +52,6 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { - static const int32x4_t const_1000 = { 1, 0, 0, 0 }; const int32x4_t const_one = vdupq_n_s32(1); // input[M * stride] * 16 @@ -64,7 +63,8 @@ void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, // If the very first value != 0, then add 1. 
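  // Portability note (presumed motivation, as the change below is otherwise
  // behavior-neutral): brace initialization of NEON vector types such as
  // int32x4_t is a GCC/Clang extension that MSVC rejects, so the constant is
  // rebuilt from a plain int32_t array via vld1q_s32() instead.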
if (input[0] != 0) { - in[0] = vaddq_s32(in[0], const_1000); + static const int32_t k1000[4] = { 1, 0, 0, 0 }; + in[0] = vaddq_s32(in[0], vld1q_s32(k1000)); } vpx_highbd_fdct4x4_pass1_neon(in); diff --git a/vpx_dsp/arm/fdct8x8_neon.h b/vpx_dsp/arm/fdct8x8_neon.h index d8fa60044..cc6515743 100644 --- a/vpx_dsp/arm/fdct8x8_neon.h +++ b/vpx_dsp/arm/fdct8x8_neon.h @@ -293,88 +293,14 @@ static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left, static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left, int32x4_t *right) { - int32x4x2_t out[8]; vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right); - - out[0].val[0] = left[0]; - out[0].val[1] = right[0]; - out[1].val[0] = left[1]; - out[1].val[1] = right[1]; - out[2].val[0] = left[2]; - out[2].val[1] = right[2]; - out[3].val[0] = left[3]; - out[3].val[1] = right[3]; - out[4].val[0] = left[4]; - out[4].val[1] = right[4]; - out[5].val[0] = left[5]; - out[5].val[1] = right[5]; - out[6].val[0] = left[6]; - out[6].val[1] = right[6]; - out[7].val[0] = left[7]; - out[7].val[1] = right[7]; - - transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], - &out[6], &out[7]); - - left[0] = out[0].val[0]; - right[0] = out[0].val[1]; - left[1] = out[1].val[0]; - right[1] = out[1].val[1]; - left[2] = out[2].val[0]; - right[2] = out[2].val[1]; - left[3] = out[3].val[0]; - right[3] = out[3].val[1]; - left[4] = out[4].val[0]; - right[4] = out[4].val[1]; - left[5] = out[5].val[0]; - right[5] = out[5].val[1]; - left[6] = out[6].val[0]; - right[6] = out[6].val[1]; - left[7] = out[7].val[0]; - right[7] = out[7].val[1]; + transpose_s32_8x8_2(left, right, left, right); } static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left, int32x4_t *right) { - int32x4x2_t out[8]; vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right); - - out[0].val[0] = left[0]; - out[0].val[1] = right[0]; - out[1].val[0] = left[1]; - out[1].val[1] = right[1]; - out[2].val[0] = left[2]; - out[2].val[1] = right[2]; - out[3].val[0] = left[3]; - out[3].val[1] = right[3]; - out[4].val[0] = left[4]; - out[4].val[1] = right[4]; - out[5].val[0] = left[5]; - out[5].val[1] = right[5]; - out[6].val[0] = left[6]; - out[6].val[1] = right[6]; - out[7].val[0] = left[7]; - out[7].val[1] = right[7]; - - transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], - &out[6], &out[7]); - - left[0] = out[0].val[0]; - right[0] = out[0].val[1]; - left[1] = out[1].val[0]; - right[1] = out[1].val[1]; - left[2] = out[2].val[0]; - right[2] = out[2].val[1]; - left[3] = out[3].val[0]; - right[3] = out[3].val[1]; - left[4] = out[4].val[0]; - right[4] = out[4].val[1]; - left[5] = out[5].val[0]; - right[5] = out[5].val[1]; - left[6] = out[6].val[0]; - right[6] = out[6].val[1]; - left[7] = out[7].val[0]; - right[7] = out[7].val[1]; + transpose_s32_8x8_2(left, right, left, right); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index 193594e3d..16f5c5fc0 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -177,6 +177,45 @@ static INLINE void butterfly_one_coeff_s32_fast( *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c); } +// fdct_round_shift((a +/- b) * c) +// Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_one_coeff_s32_s64_narrow( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant, 
int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + // ac holds the following values: + // ac: vget_low_s32(a_lo) * c, vget_high_s32(a_lo) * c, + // vget_low_s32(a_hi) * c, vget_high_s32(a_hi) * c + int64x2_t ac[4]; + int64x2_t sum[4]; + int64x2_t diff[4]; + + ac[0] = vmull_n_s32(vget_low_s32(a_lo), constant); + ac[1] = vmull_n_s32(vget_high_s32(a_lo), constant); + ac[2] = vmull_n_s32(vget_low_s32(a_hi), constant); + ac[3] = vmull_n_s32(vget_high_s32(a_hi), constant); + + sum[0] = vmlal_n_s32(ac[0], vget_low_s32(b_lo), constant); + sum[1] = vmlal_n_s32(ac[1], vget_high_s32(b_lo), constant); + sum[2] = vmlal_n_s32(ac[2], vget_low_s32(b_hi), constant); + sum[3] = vmlal_n_s32(ac[3], vget_high_s32(b_hi), constant); + *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS), + vrshrn_n_s64(sum[1], DCT_CONST_BITS)); + *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS), + vrshrn_n_s64(sum[3], DCT_CONST_BITS)); + + diff[0] = vmlsl_n_s32(ac[0], vget_low_s32(b_lo), constant); + diff[1] = vmlsl_n_s32(ac[1], vget_high_s32(b_lo), constant); + diff[2] = vmlsl_n_s32(ac[2], vget_low_s32(b_hi), constant); + diff[3] = vmlsl_n_s32(ac[3], vget_high_s32(b_hi), constant); + *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS), + vrshrn_n_s64(diff[1], DCT_CONST_BITS)); + *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS), + vrshrn_n_s64(diff[3], DCT_CONST_BITS)); +} + // fdct_round_shift(a * c1 +/- b * c2) // Variant that performs normal implementation on half vector // more accurate does 64-bit processing, takes and returns 32-bit values @@ -207,6 +246,44 @@ static INLINE void butterfly_two_coeff_s32_s64_narrow_half( // fdct_round_shift(a * c1 +/- b * c2) // Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 64-bit values +// returns results without rounding +static INLINE void butterfly_two_coeff_s32_s64_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int64x2_t *add_lo /*[2]*/, + int64x2_t *add_hi /*[2]*/, int64x2_t *sub_lo /*[2]*/, + int64x2_t *sub_hi /*[2]*/) { + // ac1/ac2 hold the following values: + // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1, + // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1 + // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2, + // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2 + int64x2_t ac1[4]; + int64x2_t ac2[4]; + + ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1); + ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1); + ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1); + ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1); + ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2); + ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2); + ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2); + ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2); + + add_lo[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2); + add_lo[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2); + add_hi[0] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2); + add_hi[1] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2); + + sub_lo[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1); + sub_lo[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1); + sub_hi[0] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1); + sub_hi[1] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1); +} + +// fdct_round_shift(a * c1 +/- b * 
c2) +// Variant that performs normal implementation on full vector // more accurate does 64-bit processing, takes and returns 32-bit values // returns narrowed results static INLINE void butterfly_two_coeff_s32_s64_narrow( @@ -420,4 +497,46 @@ static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) { return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2); } +static INLINE int32x4_t add_s64_round_narrow(const int64x2_t *a /*[2]*/, + const int64x2_t *b /*[2]*/) { + int64x2_t result[2]; + result[0] = vaddq_s64(a[0], b[0]); + result[1] = vaddq_s64(a[1], b[1]); + return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS), + vrshrn_n_s64(result[1], DCT_CONST_BITS)); +} + +static INLINE int32x4_t sub_s64_round_narrow(const int64x2_t *a /*[2]*/, + const int64x2_t *b /*[2]*/) { + int64x2_t result[2]; + result[0] = vsubq_s64(a[0], b[0]); + result[1] = vsubq_s64(a[1], b[1]); + return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS), + vrshrn_n_s64(result[1], DCT_CONST_BITS)); +} + +static INLINE int32x4_t add_s32_s64_narrow(const int32x4_t a, + const int32x4_t b) { + int64x2_t a64[2], b64[2], result[2]; + a64[0] = vmovl_s32(vget_low_s32(a)); + a64[1] = vmovl_s32(vget_high_s32(a)); + b64[0] = vmovl_s32(vget_low_s32(b)); + b64[1] = vmovl_s32(vget_high_s32(b)); + result[0] = vaddq_s64(a64[0], b64[0]); + result[1] = vaddq_s64(a64[1], b64[1]); + return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1])); +} + +static INLINE int32x4_t sub_s32_s64_narrow(const int32x4_t a, + const int32x4_t b) { + int64x2_t a64[2], b64[2], result[2]; + a64[0] = vmovl_s32(vget_low_s32(a)); + a64[1] = vmovl_s32(vget_high_s32(a)); + b64[0] = vmovl_s32(vget_low_s32(b)); + b64[1] = vmovl_s32(vget_high_s32(b)); + result[0] = vsubq_s64(a64[0], b64[0]); + result[1] = vsubq_s64(a64[1], b64[1]); + return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1])); +} + #endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c index 718dba0d9..df0da543c 100644 --- a/vpx_dsp/arm/fdct_partial_neon.c +++ b/vpx_dsp/arm/fdct_partial_neon.c @@ -37,6 +37,15 @@ void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { output[1] = 0; } +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// will fail with an internal compiler error. +// See: +// https://developercommunity.visualstudio.com/t/Compiler-crash-C1001-when-building-a-for/10346110 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. 
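+// Scope note: MSVC applies #pragma optimize("", off) to every function
+// defined after it until the matching #pragma optimize("", on) restores the
+// command-line settings, so the workaround disables optimization for
+// vpx_fdct8x8_1_neon() only.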
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", off) +#endif void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { int r; int16x8_t sum = vld1q_s16(&input[0]); @@ -49,6 +58,9 @@ void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { output[0] = (tran_low_t)horizontal_add_int16x8(sum); output[1] = 0; } +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", on) +#endif void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride) { diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c index f6b6d7e3c..f5a044be4 100644 --- a/vpx_dsp/arm/hadamard_neon.c +++ b/vpx_dsp/arm/hadamard_neon.c @@ -138,15 +138,15 @@ void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512); const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768); - const int16x8_t b0 = vhaddq_s16(a0, a1); - const int16x8_t b1 = vhsubq_s16(a0, a1); - const int16x8_t b2 = vhaddq_s16(a2, a3); - const int16x8_t b3 = vhsubq_s16(a2, a3); + const int16x8_t b0 = vshrq_n_s16(vhaddq_s16(a0, a1), 1); + const int16x8_t b1 = vshrq_n_s16(vhsubq_s16(a0, a1), 1); + const int16x8_t b2 = vshrq_n_s16(vhaddq_s16(a2, a3), 1); + const int16x8_t b3 = vshrq_n_s16(vhsubq_s16(a2, a3), 1); - const int16x8_t c0 = vhaddq_s16(b0, b2); - const int16x8_t c1 = vhaddq_s16(b1, b3); - const int16x8_t c2 = vhsubq_s16(b0, b2); - const int16x8_t c3 = vhsubq_s16(b1, b3); + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); store_s16q_to_tran_low(coeff + 0, c0); store_s16q_to_tran_low(coeff + 256, c1); diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c new file mode 100644 index 000000000..4265596c8 --- /dev/null +++ b/vpx_dsp/arm/highbd_avg_neon.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *s8, int p) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * p, p); + const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * p, p); + return (horizontal_add_uint16x8(vaddq_u16(a0, a1)) + (1 << 3)) >> 4; +} + +uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *s8, int p) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7; + + load_u16_8x8(a_ptr, p, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + sum = vaddq_u16(a0, a1); + sum = vaddq_u16(sum, a2); + sum = vaddq_u16(sum, a3); + sum = vaddq_u16(sum, a4); + sum = vaddq_u16(sum, a5); + sum = vaddq_u16(sum, a6); + sum = vaddq_u16(sum, a7); + + return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6; +} + +// coeff: 32 bits, dynamic range [-2147483648, 2147483647]. +// length: value range {16, 64, 256, 1024}. 
+// satd: 42 bits, dynamic range [-2147483648 * 1024, 2147483647 * 1024] +int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) { + int64x2_t sum_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + int32x4_t abs0, abs1; + const int32x4_t s0 = load_tran_low_to_s32q(coeff); + const int32x4_t s1 = load_tran_low_to_s32q(coeff + 4); + + abs0 = vabsq_s32(s0); + sum_s64[0] = vpadalq_s32(sum_s64[0], abs0); + abs1 = vabsq_s32(s1); + sum_s64[1] = vpadalq_s32(sum_s64[1], abs1); + + length -= 8; + coeff += 8; + } while (length != 0); + + return (int)horizontal_add_int64x2(vaddq_s64(sum_s64[0], sum_s64[1])); +} + +void vpx_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8, + int dp, int *min, int *max) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8); + + const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p); + const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p); + const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p); + const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p); + const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p); + const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p); + const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p); + const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p); + + const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp); + const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp); + const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp); + const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp); + const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp); + const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp); + const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp); + const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp); + + const uint16x8_t abs_diff0 = vabdq_u16(a0, b0); + const uint16x8_t abs_diff1 = vabdq_u16(a1, b1); + const uint16x8_t abs_diff2 = vabdq_u16(a2, b2); + const uint16x8_t abs_diff3 = vabdq_u16(a3, b3); + const uint16x8_t abs_diff4 = vabdq_u16(a4, b4); + const uint16x8_t abs_diff5 = vabdq_u16(a5, b5); + const uint16x8_t abs_diff6 = vabdq_u16(a6, b6); + const uint16x8_t abs_diff7 = vabdq_u16(a7, b7); + + const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1); + const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3); + const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5); + const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7); + + const uint16x8_t max0123 = vmaxq_u16(max01, max23); + const uint16x8_t max4567 = vmaxq_u16(max45, max67); + const uint16x8_t max07 = vmaxq_u16(max0123, max4567); + + const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1); + const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3); + const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5); + const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7); + + const uint16x8_t min0123 = vminq_u16(min01, min23); + const uint16x8_t min4567 = vminq_u16(min45, min67); + const uint16x8_t min07 = vminq_u16(min0123, min4567); + +#if VPX_ARCH_AARCH64 + *min = *max = 0; // Clear high bits + *((uint16_t *)max) = vmaxvq_u16(max07); + *((uint16_t *)min) = vminvq_u16(min07); +#else + // Split into 64-bit vectors and execute pairwise min/max. + uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07)); + uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07)); + + // Enough runs of vpmax/min propagate the max/min values to every position. 
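A scalar model of the pairwise propagation used in the non-AArch64 branch below may help: each vpmax/vpmin pass halves the number of distinct candidates, so log2(n) passes leave every lane holding the overall result for an n-lane vector. The helper here is hypothetical and only illustrates the idea:

#include <stdint.h>

/* Scalar model of repeated vpmax: each pass reduces adjacent pairs, so
 * after log2(n) passes all lanes agree on the overall maximum.
 * n must be a power of two no larger than 8. */
static uint16_t pairwise_max_u16(const uint16_t *lanes, int n) {
  uint16_t tmp[8];
  int i;
  for (i = 0; i < n; ++i) tmp[i] = lanes[i];
  while (n > 1) {
    for (i = 0; i < n / 2; ++i) {
      const uint16_t a = tmp[2 * i];
      const uint16_t b = tmp[2 * i + 1];
      tmp[i] = a > b ? a : b;
    }
    n /= 2;
  }
  return tmp[0];
}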
+ ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + *min = *max = 0; // Clear high bits + // Store directly to avoid costly neon->gpr transfer. + vst1_lane_u16((uint16_t *)max, ab_max, 0); + vst1_lane_u16((uint16_t *)min, ab_min, 0); +#endif +} diff --git a/vpx_dsp/arm/highbd_avg_pred_neon.c b/vpx_dsp/arm/highbd_avg_pred_neon.c new file mode 100644 index 000000000..3063acbb3 --- /dev/null +++ b/vpx_dsp/arm/highbd_avg_pred_neon.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i = height; + if (width > 8) { + do { + int j = 0; + do { + const uint16x8_t p = vld1q_u16(pred + j); + const uint16x8_t r = vld1q_u16(ref + j); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred + j, avg); + + j += 8; + } while (j < width); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } else if (width == 8) { + do { + const uint16x8_t p = vld1q_u16(pred); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred, avg); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } else { + assert(width == 4); + do { + const uint16x4_t p = vld1_u16(pred); + const uint16x4_t r = vld1_u16(ref); + + uint16x4_t avg = vrhadd_u16(p, r); + vst1_u16(comp_pred, avg); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } +} diff --git a/vpx_dsp/arm/highbd_hadamard_neon.c b/vpx_dsp/arm/highbd_hadamard_neon.c new file mode 100644 index 000000000..7be88f6bc --- /dev/null +++ b/vpx_dsp/arm/highbd_hadamard_neon.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, + int16x8_t *a7) { + int16x8_t b0 = vaddq_s16(*a0, *a1); + int16x8_t b1 = vsubq_s16(*a0, *a1); + int16x8_t b2 = vaddq_s16(*a2, *a3); + int16x8_t b3 = vsubq_s16(*a2, *a3); + int16x8_t b4 = vaddq_s16(*a4, *a5); + int16x8_t b5 = vsubq_s16(*a4, *a5); + int16x8_t b6 = vaddq_s16(*a6, *a7); + int16x8_t b7 = vsubq_s16(*a6, *a7); + + int16x8_t c0 = vaddq_s16(b0, b2); + int16x8_t c2 = vsubq_s16(b0, b2); + int16x8_t c1 = vaddq_s16(b1, b3); + int16x8_t c3 = vsubq_s16(b1, b3); + int16x8_t c4 = vaddq_s16(b4, b6); + int16x8_t c6 = vsubq_s16(b4, b6); + int16x8_t c5 = vaddq_s16(b5, b7); + int16x8_t c7 = vsubq_s16(b5, b7); + + *a0 = vaddq_s16(c0, c4); + *a2 = vsubq_s16(c0, c4); + *a7 = vaddq_s16(c1, c5); + *a6 = vsubq_s16(c1, c5); + *a3 = vaddq_s16(c2, c6); + *a1 = vsubq_s16(c2, c6); + *a4 = vaddq_s16(c3, c7); + *a5 = vsubq_s16(c3, c7); +} + +static INLINE void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1, + int16x4_t a2, int16x4_t a3, + int16x4_t a4, int16x4_t a5, + int16x4_t a6, int16x4_t a7, + tran_low_t *coeff) { + int32x4_t b0 = vaddl_s16(a0, a1); + int32x4_t b1 = vsubl_s16(a0, a1); + int32x4_t b2 = vaddl_s16(a2, a3); + int32x4_t b3 = vsubl_s16(a2, a3); + int32x4_t b4 = vaddl_s16(a4, a5); + int32x4_t b5 = vsubl_s16(a4, a5); + int32x4_t b6 = vaddl_s16(a6, a7); + int32x4_t b7 = vsubl_s16(a6, a7); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c3 = vsubq_s32(b1, b3); + int32x4_t c4 = vaddq_s32(b4, b6); + int32x4_t c6 = vsubq_s32(b4, b6); + int32x4_t c5 = vaddq_s32(b5, b7); + int32x4_t c7 = vsubq_s32(b5, b7); + + int32x4_t d0 = vaddq_s32(c0, c4); + int32x4_t d2 = vsubq_s32(c0, c4); + int32x4_t d7 = vaddq_s32(c1, c5); + int32x4_t d6 = vsubq_s32(c1, c5); + int32x4_t d3 = vaddq_s32(c2, c6); + int32x4_t d1 = vsubq_s32(c2, c6); + int32x4_t d4 = vaddq_s32(c3, c7); + int32x4_t d5 = vsubq_s32(c3, c7); + + store_s32q_to_tran_low(coeff + 0, d0); + store_s32q_to_tran_low(coeff + 4, d1); + store_s32q_to_tran_low(coeff + 8, d2); + store_s32q_to_tran_low(coeff + 12, d3); + store_s32q_to_tran_low(coeff + 16, d4); + store_s32q_to_tran_low(coeff + 20, d5); + store_s32q_to_tran_low(coeff + 24, d6); + store_s32q_to_tran_low(coeff + 28, d7); +} + +void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x4_t b0, b1, b2, b3, b4, b5, b6, b7; + + int16x8_t s0 = vld1q_s16(src_diff + 0 * src_stride); + int16x8_t s1 = vld1q_s16(src_diff + 1 * src_stride); + int16x8_t s2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t s3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t s4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t s5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t s6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t s7 = vld1q_s16(src_diff + 7 * src_stride); + + // For the first pass we can stay in 16-bit elements (4095*8 = 32760). + hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + // For the second pass we need to widen to 32-bit elements, so we're + // processing 4 columns at a time. + // Skip the second transpose because it is not required. 
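Restating the bit-width argument from the first-pass comment as explicit arithmetic (a sanity sketch, assuming at most 12-bit input so |src_diff| <= 4095): each of the three butterfly stages at most doubles the magnitude, so the first pass peaks at 4095 * 8 = 32760, which still fits in int16_t, while a full 8x8 transform would reach 4095 * 64, which is why the second pass widens to 32 bits.

#include <assert.h>
#include <stdint.h>

static void hadamard_headroom_check(void) {
  assert(4095 * 8 <= INT16_MAX);  /* first pass is safe in 16 bits */
  assert(4095 * 64 > INT16_MAX);  /* second pass must widen to 32 bits */
}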
+ + b0 = vget_low_s16(s0); + b1 = vget_low_s16(s1); + b2 = vget_low_s16(s2); + b3 = vget_low_s16(s3); + b4 = vget_low_s16(s4); + b5 = vget_low_s16(s5); + b6 = vget_low_s16(s6); + b7 = vget_low_s16(s7); + + hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff); + + b0 = vget_high_s16(s0); + b1 = vget_high_s16(s1); + b2 = vget_high_s16(s2); + b3 = vget_high_s16(s3); + b4 = vget_high_s16(s4); + b5 = vget_high_s16(s5); + b6 = vget_high_s16(s6); + b7 = vget_high_s16(s7); + + hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff + 32); +} + +void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int i = 0; + + // Rearrange 16x16 to 8x32 and remove stride. + // Top left first. + vpx_highbd_hadamard_8x8_neon(src_diff, src_stride, coeff); + // Top right. + vpx_highbd_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64); + // Bottom left. + vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride, + coeff + 128); + // Bottom right. + vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride, + coeff + 192); + + do { + int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i); + int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 64); + int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 128); + int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 192); + + int32x4_t b0 = vhaddq_s32(a0, a1); + int32x4_t b1 = vhsubq_s32(a0, a1); + int32x4_t b2 = vhaddq_s32(a2, a3); + int32x4_t b3 = vhsubq_s32(a2, a3); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c3 = vsubq_s32(b1, b3); + + store_s32q_to_tran_low(coeff + 4 * i, c0); + store_s32q_to_tran_low(coeff + 4 * i + 64, c1); + store_s32q_to_tran_low(coeff + 4 * i + 128, c2); + store_s32q_to_tran_low(coeff + 4 * i + 192, c3); + } while (++i < 16); +} + +void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int i = 0; + + // Rearrange 32x32 to 16x64 and remove stride. + // Top left first. + vpx_highbd_hadamard_16x16_neon(src_diff, src_stride, coeff); + // Top right. + vpx_highbd_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256); + // Bottom left. + vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride, + coeff + 512); + // Bottom right. 
+ vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride, + coeff + 768); + + do { + int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i); + int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 256); + int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 512); + int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 768); + + int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2); + int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2); + int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2); + int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c3 = vsubq_s32(b1, b3); + + store_s32q_to_tran_low(coeff + 4 * i, c0); + store_s32q_to_tran_low(coeff + 4 * i + 256, c1); + store_s32q_to_tran_low(coeff + 4 * i + 512, c2); + store_s32q_to_tran_low(coeff + 4 * i + 768, c3); + } while (++i < 64); +} diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 6f7e5da76..235cb5b99 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -12,23 +12,22 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "sum_neon.h" #include "vpx/vpx_integer.h" //------------------------------------------------------------------------------ // DC 4x4 -static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) { +static INLINE uint16_t dc_sum_4(const uint16_t *ref) { const uint16x4_t ref_u16 = vld1_u16(ref); - const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16); - return vpadd_u16(p0, p0); + return horizontal_add_uint16x4(ref_u16); } static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, const uint16x4_t dc) { - const uint16x4_t dc_dup = vdup_lane_u16(dc, 0); int i; for (i = 0; i < 4; ++i, dst += stride) { - vst1_u16(dst, dc_dup); + vst1_u16(dst, dc); } } @@ -37,21 +36,17 @@ void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int bd) { const uint16x4_t a = vld1_u16(above); const uint16x4_t l = vld1_u16(left); - uint16x4_t sum; - uint16x4_t dc; + const uint16_t sum = horizontal_add_uint16x4(vadd_u16(a, l)); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 3); (void)bd; - sum = vadd_u16(a, l); - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vrshr_n_u16(sum, 3); dc_store_4x4(dst, stride, dc); } void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_4(left); - const uint16x4_t dc = vrshr_n_u16(sum, 2); + const uint16_t sum = dc_sum_4(left); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2); (void)above; (void)bd; dc_store_4x4(dst, stride, dc); @@ -60,8 +55,8 @@ void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_4(above); - const uint16x4_t dc = vrshr_n_u16(sum, 2); + const uint16_t sum = dc_sum_4(above); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2); (void)left; (void)bd; dc_store_4x4(dst, stride, dc); @@ -79,19 +74,16 @@ void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 8x8 -static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) { +static INLINE uint16_t dc_sum_8(const uint16_t *ref) { const uint16x8_t ref_u16 = vld1q_u16(ref); - uint16x4_t 
sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); + return horizontal_add_uint16x8(ref_u16); } static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - const uint16x8_t dc_dup = vdupq_lane_u16(dc, 0); + const uint16x8_t dc) { int i; for (i = 0; i < 8; ++i, dst += stride) { - vst1q_u16(dst, dc_dup); + vst1q_u16(dst, dc); } } @@ -101,20 +93,17 @@ void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16x8_t above_u16 = vld1q_u16(above); const uint16x8_t left_u16 = vld1q_u16(left); const uint16x8_t p0 = vaddq_u16(above_u16, left_u16); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - uint16x4_t dc; + const uint16_t sum = horizontal_add_uint16x8(p0); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)bd; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vrshr_n_u16(sum, 4); dc_store_8x8(dst, stride, dc); } void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_8(left); - const uint16x4_t dc = vrshr_n_u16(sum, 3); + const uint16_t sum = dc_sum_8(left); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3); (void)above; (void)bd; dc_store_8x8(dst, stride, dc); @@ -123,8 +112,8 @@ void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_8(above); - const uint16x4_t dc = vrshr_n_u16(sum, 3); + const uint16_t sum = dc_sum_8(above); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3); (void)left; (void)bd; dc_store_8x8(dst, stride, dc); @@ -133,7 +122,7 @@ void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_8x8(dst, stride, dc); @@ -142,47 +131,43 @@ void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 16x16 -static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) { - const uint16x8x2_t ref_u16 = vld2q_u16(ref); - const uint16x8_t p0 = vaddq_u16(ref_u16.val[0], ref_u16.val[1]); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_16(const uint16_t *ref) { + const uint16x8_t ref_u16_0 = vld1q_u16(ref + 0); + const uint16x8_t ref_u16_1 = vld1q_u16(ref + 8); + const uint16x8_t p0 = vaddq_u16(ref_u16_0, ref_u16_1); + return horizontal_add_uint16x8(p0); } static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - uint16x8x2_t dc_dup; + const uint16x8_t dc) { int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); for (i = 0; i < 16; ++i, dst += stride) { - vst2q_u16(dst, dc_dup); + vst1q_u16(dst + 0, dc); + vst1q_u16(dst + 8, dc); } } void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t a = vld2q_u16(above); - const uint16x8x2_t l = vld2q_u16(left); - const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]); - 
const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]); + const uint16x8_t a0 = vld1q_u16(above + 0); + const uint16x8_t a1 = vld1q_u16(above + 8); + const uint16x8_t l0 = vld1q_u16(left + 0); + const uint16x8_t l1 = vld1q_u16(left + 8); + const uint16x8_t pa = vaddq_u16(a0, a1); + const uint16x8_t pl = vaddq_u16(l0, l1); const uint16x8_t pal0 = vaddq_u16(pa, pl); - uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); - uint32x2_t sum; - uint16x4_t dc; + const uint32_t sum = horizontal_add_uint16x8(pal0); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)bd; - pal1 = vpadd_u16(pal1, pal1); - sum = vpaddl_u16(pal1); - dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); dc_store_16x16(dst, stride, dc); } void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_16(left); - const uint16x4_t dc = vrshr_n_u16(sum, 4); + const uint16_t sum = dc_sum_16(left); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)above; (void)bd; dc_store_16x16(dst, stride, dc); @@ -191,8 +176,8 @@ void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_16(above); - const uint16x4_t dc = vrshr_n_u16(sum, 4); + const uint16_t sum = dc_sum_16(above); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)left; (void)bd; dc_store_16x16(dst, stride, dc); @@ -201,7 +186,7 @@ void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_16x16(dst, stride, dc); @@ -210,56 +195,58 @@ void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 32x32 -static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { - const uint16x8x4_t r = vld4q_u16(ref); - const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]); - const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]); +static INLINE uint32_t dc_sum_32(const uint16_t *ref) { + const uint16x8_t r0 = vld1q_u16(ref + 0); + const uint16x8_t r1 = vld1q_u16(ref + 8); + const uint16x8_t r2 = vld1q_u16(ref + 16); + const uint16x8_t r3 = vld1q_u16(ref + 24); + const uint16x8_t p0 = vaddq_u16(r0, r1); + const uint16x8_t p1 = vaddq_u16(r2, r3); const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - sum = vpadd_u16(sum, sum); - return vpaddl_u16(sum); + return horizontal_add_uint16x8(p2); } static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - uint16x8x2_t dc_dup; + const uint16x8_t dc) { int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); - for (i = 0; i < 32; ++i) { - vst2q_u16(dst, dc_dup); - dst += 16; - vst2q_u16(dst, dc_dup); - dst += stride - 16; + vst1q_u16(dst + 0, dc); + vst1q_u16(dst + 8, dc); + vst1q_u16(dst + 16, dc); + vst1q_u16(dst + 24, dc); + dst += stride; } } void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x4_t a = vld4q_u16(above); - const 
uint16x8x4_t l = vld4q_u16(left); - const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]); - const uint16x8_t pa1 = vaddq_u16(a.val[2], a.val[3]); - const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]); - const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]); + const uint16x8_t a0 = vld1q_u16(above + 0); + const uint16x8_t a1 = vld1q_u16(above + 8); + const uint16x8_t a2 = vld1q_u16(above + 16); + const uint16x8_t a3 = vld1q_u16(above + 24); + const uint16x8_t l0 = vld1q_u16(left + 0); + const uint16x8_t l1 = vld1q_u16(left + 8); + const uint16x8_t l2 = vld1q_u16(left + 16); + const uint16x8_t l3 = vld1q_u16(left + 24); + const uint16x8_t pa0 = vaddq_u16(a0, a1); + const uint16x8_t pa1 = vaddq_u16(a2, a3); + const uint16x8_t pl0 = vaddq_u16(l0, l1); + const uint16x8_t pl1 = vaddq_u16(l2, l3); const uint16x8_t pa = vaddq_u16(pa0, pa1); const uint16x8_t pl = vaddq_u16(pl0, pl1); const uint16x8_t pal0 = vaddq_u16(pa, pl); - const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); - uint32x2_t sum = vpaddl_u16(pal1); - uint16x4_t dc; + const uint32_t sum = horizontal_add_uint16x8(pal0); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 6), 0); (void)bd; - sum = vpadd_u32(sum, sum); - dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6)); dc_store_32x32(dst, stride, dc); } void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint32x2_t sum = dc_sum_32(left); - const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + const uint32_t sum = dc_sum_32(left); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)above; (void)bd; dc_store_32x32(dst, stride, dc); @@ -268,8 +255,8 @@ void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint32x2_t sum = dc_sum_32(above); - const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + const uint32_t sum = dc_sum_32(above); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)left; (void)bd; dc_store_32x32(dst, stride, dc); @@ -278,7 +265,7 @@ void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_32x32(dst, stride, dc); @@ -289,166 +276,1304 @@ void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t ABCDEFGH = vld1q_u16(above); - const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1); - const uint16x8_t CDEFGH00 = vld1q_u16(above + 2); - const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00); - const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0); - const uint16x4_t avg2_low = vget_low_u16(avg2); - const uint16x4_t avg2_high = vget_high_u16(avg2); - const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1); - const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2); - const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3); + uint16x8_t a0, a1, a2, d0; + uint16_t a7; (void)left; (void)bd; - vst1_u16(dst, avg2_low); - dst += stride; - vst1_u16(dst, r1); - dst += 
stride; - vst1_u16(dst, r2); - dst += stride; - vst1_u16(dst, r3); - vst1q_lane_u16(dst + 3, ABCDEFGH, 7); -} -static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride, - const uint16x8_t above_right, uint16x8_t *row) { - *row = vextq_u16(*row, above_right, 1); - vst1q_u16(*dst, *row); - *dst += stride; + a0 = vld1q_u16(above); + a7 = above[7]; + + // [ above[1], ..., above[7], x ] + a1 = vextq_u16(a0, a0, 1); + // [ above[2], ..., above[7], x, x ] + a2 = vextq_u16(a0, a0, 2); + + // d0[0] = AVG3(above[0], above[1], above[2]); + // ... + // d0[5] = AVG3(above[5], above[6], above[7]); + // d0[6] = x (don't care) + // d0[7] = x (don't care) + d0 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + + // We want: + // stride=0 [ d0[0], d0[1], d0[2], d0[3] ] + // stride=1 [ d0[1], d0[2], d0[3], d0[4] ] + // stride=2 [ d0[2], d0[3], d0[4], d0[5] ] + // stride=3 [ d0[3], d0[4], d0[5], above[7] ] + vst1_u16(dst + 0 * stride, vget_low_u16(d0)); + vst1_u16(dst + 1 * stride, vget_low_u16(vextq_u16(d0, d0, 1))); + vst1_u16(dst + 2 * stride, vget_low_u16(vextq_u16(d0, d0, 2))); + vst1_u16(dst + 3 * stride, vget_low_u16(vextq_u16(d0, d0, 3))); + + // We stored d0[6] above, so fixup into above[7]. + dst[3 * stride + 3] = a7; } void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0 = vld1q_u16(above); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3); - const uint16x8_t A1 = vld1q_u16(above + 1); - const uint16x8_t A2 = vld1q_u16(above + 2); - const uint16x8_t avg1 = vhaddq_u16(A0, A2); - uint16x8_t row = vrhaddq_u16(avg1, A1); + uint16x8_t ax0, a0, a1, a7, d0; (void)left; (void)bd; - vst1q_u16(dst, row); - dst += stride; - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - vst1q_u16(dst, above_right); -} - -static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, - const uint16x8_t above_right, uint16x8_t *row_0, - uint16x8_t *row_1) { - *row_0 = vextq_u16(*row_0, *row_1, 1); - *row_1 = vextq_u16(*row_1, above_right, 1); - vst1q_u16(*dst, *row_0); - *dst += 8; - vst1q_u16(*dst, *row_1); - *dst += stride - 8; + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_dup_u16(above + 7); + + // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can + // shift in above[7] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[7] = AVG3(above[6], above[7], above[8]); + d0 = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + + // Undo the earlier ext, then incrementally shift in duplicates of above[7].
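The AVG3 filter used throughout these d45/d63/d117 hunks is computed as vrhaddq_u16(vhaddq_u16(a0, a2), a1), and this split is an exact match for the scalar rounding, not an approximation. A small sketch verifying the identity (the helper names are illustrative):

#include <assert.h>
#include <stdint.h>

/* Scalar reference: AVG3(a, b, c) = (a + 2 * b + c + 2) >> 2. */
static uint16_t avg3_ref(uint16_t a, uint16_t b, uint16_t c) {
  return (uint16_t)((a + 2 * b + c + 2) >> 2);
}

/* Model of vrhadd(vhadd(a, c), b): (((a + c) >> 1) + b + 1) >> 1.
 * Writing a + c = 2q + r with r in {0, 1} shows both expressions reduce
 * to (q + b + 1) >> 1, so they agree for all inputs. */
static uint16_t avg3_neon_model(uint16_t a, uint16_t b, uint16_t c) {
  return (uint16_t)((((a + c) >> 1) + b + 1) >> 1);
}

static void avg3_spot_check(void) {
  assert(avg3_ref(1, 2, 3) == avg3_neon_model(1, 2, 3));
  assert(avg3_ref(4095, 0, 1) == avg3_neon_model(4095, 0, 1));
}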
+ vst1q_u16(dst + 0 * stride, vextq_u16(d0, a7, 1)); + vst1q_u16(dst + 1 * stride, vextq_u16(d0, a7, 2)); + vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 3)); + vst1q_u16(dst + 3 * stride, vextq_u16(d0, a7, 4)); + vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 5)); + vst1q_u16(dst + 5 * stride, vextq_u16(d0, a7, 6)); + vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 7)); + vst1q_u16(dst + 7 * stride, a7); } void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0_0 = vld1q_u16(above); - const uint16x8_t A0_1 = vld1q_u16(above + 8); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3); - const uint16x8_t A1_0 = vld1q_u16(above + 1); - const uint16x8_t A1_1 = vld1q_u16(above + 9); - const uint16x8_t A2_0 = vld1q_u16(above + 2); - const uint16x8_t A2_1 = vld1q_u16(above + 10); - const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); - const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); - uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); - uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + uint16x8_t ax0, a0, a1, a7, a8, a9, a15, d0[2]; (void)left; (void)bd; - vst1q_u16(dst, row_0); - vst1q_u16(dst + 8, row_1); - dst += stride; - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - vst1q_u16(dst, above_right); - vst1q_u16(dst + 8, above_right); + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a15 = vld1q_dup_u16(above + 15); + + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + // We have one unused lane here to leave room to shift in above[15] in the + // last lane: + // d0[0][0] = x (don't care) + // d0[0][1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[0][7] = AVG3(above[6], above[7], above[8]); + // d0[1][0] = AVG3(above[7], above[8], above[9]); + // ... + // d0[1][7] = AVG3(above[14], above[15], above[16]); + d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8); + + // Incrementally shift in duplicates of above[15].
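The long run of stores that follows is a sliding window: row r of the d45 output is the AVG3 sequence advanced by r lanes, padded with the duplicated top-right sample once the sequence runs out. A scalar model of the row generation (a hypothetical helper, not part of the patch):

#include <stdint.h>

/* Row r, column c of an n-wide d45 block takes entry r + c of the AVG3
 * sequence, or the top-right sample once that index runs off the end. */
static void d45_row_model(uint16_t *row, const uint16_t *avg3_seq,
                          int seq_len, uint16_t top_right, int r, int n) {
  int c;
  for (c = 0; c < n; ++c) {
    const int i = r + c;
    row[c] = (i < seq_len) ? avg3_seq[i] : top_right;
  }
}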
+ vst1q_u16(dst + 0 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d0[1], a15, 1)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d0[1], a15, 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], a15, 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d0[1], a15, 4)); + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], a15, 5)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d0[1], a15, 6)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], a15, 7)); + vst1q_u16(dst + 7 * stride + 0, d0[1]); + vst1q_u16(dst + 7 * stride + 8, a15); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[1], a15, 1)); + vst1q_u16(dst + 8 * stride + 8, a15); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d0[1], a15, 2)); + vst1q_u16(dst + 9 * stride + 8, a15); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[1], a15, 3)); + vst1q_u16(dst + 10 * stride + 8, a15); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d0[1], a15, 4)); + vst1q_u16(dst + 11 * stride + 8, a15); + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[1], a15, 5)); + vst1q_u16(dst + 12 * stride + 8, a15); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d0[1], a15, 6)); + vst1q_u16(dst + 13 * stride + 8, a15); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[1], a15, 7)); + vst1q_u16(dst + 14 * stride + 8, a15); + vst1q_u16(dst + 15 * stride + 0, a15); + vst1q_u16(dst + 15 * stride + 8, a15); } void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0_0 = vld1q_u16(above); - const uint16x8_t A0_1 = vld1q_u16(above + 8); - const uint16x8_t A0_2 = vld1q_u16(above + 16); - const uint16x8_t A0_3 = vld1q_u16(above + 24); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3); - const uint16x8_t A1_0 = vld1q_u16(above + 1); - const uint16x8_t A1_1 = vld1q_u16(above + 9); - const uint16x8_t A1_2 = vld1q_u16(above + 17); - const uint16x8_t A1_3 = vld1q_u16(above + 25); - const uint16x8_t A2_0 = vld1q_u16(above + 2); - const uint16x8_t A2_1 = vld1q_u16(above + 10); - const uint16x8_t A2_2 = vld1q_u16(above + 18); - const uint16x8_t A2_3 = vld1q_u16(above + 26); - const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); - const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); - const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2); - const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3); - uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); - uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); - uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2); - uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3); + uint16x8_t ax0, a0, a1, a7, a8, a9, a15, a16, a17, a23, a24, a25, a31, d0[4]; int i; (void)left; (void)bd; - vst1q_u16(dst, row_0); - dst += 8; - vst1q_u16(dst, row_1); - dst += 8; - vst1q_u16(dst, row_2); - dst += 8; - vst1q_u16(dst, row_3); - dst += stride - 24; - - for (i = 0; i < 30; ++i) { - row_0 = vextq_u16(row_0, row_1, 1); - row_1 = vextq_u16(row_1, row_2, 1); - row_2 = vextq_u16(row_2, row_3, 1); - row_3 = vextq_u16(row_3, above_right, 1); - vst1q_u16(dst, row_0); - dst += 8; - vst1q_u16(dst, row_1); - dst += 8; - vst1q_u16(dst, row_2); - dst += 8; - vst1q_u16(dst, row_3); - dst += stride - 24; + a0 = 
vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a17 = vld1q_u16(above + 17); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + a25 = vld1q_u16(above + 25); + a31 = vld1q_dup_u16(above + 31); + + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8); + d0[2] = vrhaddq_u16(vhaddq_u16(a15, a17), a16); + d0[3] = vrhaddq_u16(vhaddq_u16(a23, a25), a24); + + for (i = 0; i < 32; ++i) { + d0[0] = vextq_u16(d0[0], d0[1], 1); + d0[1] = vextq_u16(d0[1], d0[2], 1); + d0[2] = vextq_u16(d0[2], d0[3], 1); + d0[3] = vextq_u16(d0[3], a31, 1); + vst1q_u16(dst + 0, d0[0]); + vst1q_u16(dst + 8, d0[1]); + vst1q_u16(dst + 16, d0[2]); + vst1q_u16(dst + 24, d0[3]); + dst += stride; + } +} - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); +// ----------------------------------------------------------------------------- + +void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t a0, a1, a2, a3, d0, d1, d2, d3; + (void)left; + (void)bd; + + a0 = vld1_u16(above + 0); + a1 = vld1_u16(above + 1); + a2 = vld1_u16(above + 2); + a3 = vld1_u16(above + 3); + + d0 = vrhadd_u16(a0, a1); + d1 = vrhadd_u16(vhadd_u16(a0, a2), a1); + d2 = vrhadd_u16(a1, a2); + d3 = vrhadd_u16(vhadd_u16(a1, a3), a2); + + // Note that here we are performing a full avg calculation for the final + // elements rather than storing a duplicate of above[3], which differs + // (correctly) from the general scheme employed by the bs={8,16,32} + // implementations in order to match the original C implementation. + vst1_u16(dst + 0 * stride, d0); + vst1_u16(dst + 1 * stride, d1); + vst1_u16(dst + 2 * stride, d2); + vst1_u16(dst + 3 * stride, d3); +} + +void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t a0, a1, a2, a7, d0, d1, d0_ext, d1_ext; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a7 = vld1q_dup_u16(above + 7); + + d0 = vrhaddq_u16(a0, a1); + d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + + // We want to store: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7] ] + // stride=3 [ d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7] ] + // stride=4 [ d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7] ] + // stride=5 [ d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7] ] + // stride=6 [ d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7], a[7] ] + // stride=7 [ d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7], a[7] ] + // Note in particular that d0[7] and d1[7] are only ever referenced in the + // stride=0 and stride=1 cases respectively, and in later strides are + // replaced by a copy of above[7]. These are equivalent if for i>7, + // above[i]==above[7]; however, that is not always the case. + + // Strip out d0[7] and d1[7] so that we can replace them with an additional + // copy of above[7]; the first vector here doesn't matter, so just reuse + // d0/d1.
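To make the caveat above concrete, consider hypothetical samples above[7] = 100 and above[8] = 60: AVG2 of the pair is 80, while the substituted value is above[7] = 100, so the two schemes genuinely differ whenever the border is not flat beyond the block edge.

#include <assert.h>
#include <stdint.h>

/* Hypothetical values illustrating why d0[7] cannot be kept in the
 * shifted rows: AVG2(100, 60) = 80 != above[7] = 100. */
static void d63_edge_case_example(void) {
  const uint16_t above7 = 100, above8 = 60;
  const uint16_t avg2 = (uint16_t)((above7 + above8 + 1) >> 1);
  assert(avg2 == 80 && avg2 != above7);
}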
+ d0_ext = vextq_u16(d0, d0, 7); + d1_ext = vextq_u16(d1, d1, 7); + + // Shuffle in duplicates of above[7] and store. + vst1q_u16(dst + 0 * stride, d0); + vst1q_u16(dst + 1 * stride, d1); + vst1q_u16(dst + 2 * stride, vextq_u16(d0_ext, a7, 2)); + vst1q_u16(dst + 3 * stride, vextq_u16(d1_ext, a7, 2)); + vst1q_u16(dst + 4 * stride, vextq_u16(d0_ext, a7, 3)); + vst1q_u16(dst + 5 * stride, vextq_u16(d1_ext, a7, 3)); + vst1q_u16(dst + 6 * stride, vextq_u16(d0_ext, a7, 4)); + vst1q_u16(dst + 7 * stride, vextq_u16(d1_ext, a7, 4)); +} + +void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation. + uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0[2], d1[2], d0_ext, d1_ext; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a10 = vld1q_u16(above + 10); + a15 = vld1q_dup_u16(above + 15); + + d0[0] = vrhaddq_u16(a0, a1); + d0[1] = vrhaddq_u16(a8, a9); + d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9); + + // Strip out the final element of d0/d1 so that we can replace it with an + // additional copy of above[15]; the first vector here doesn't matter, so + // just reuse the same vector. + d0_ext = vextq_u16(d0[1], d0[1], 7); + d1_ext = vextq_u16(d1[1], d1[1], 7); + + // Shuffle in duplicates of above[15] and store. Note that cases involving + // {d0,d1}_ext require an extra shift to undo the shifting out of the final + // element from above. + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_ext, a15, 2)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_ext, a15, 2)); + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_ext, a15, 3)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_ext, a15, 3)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_ext, a15, 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_ext, a15, 4)); + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_ext, a15, 5)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_ext, a15, 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_ext, a15, 6)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_ext, a15, 6)); + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_ext, a15, 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_ext, a15, 7)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 14 * stride + 8, a15); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 15 * stride + 8, a15); +} +
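Before the 32x32 version below, it may help to see the whole d63 pattern in scalar form: even output rows walk an AVG2 sequence, odd rows an AVG3 sequence, and both advance one step every two rows. This is a simplified scalar model under the assumptions discussed above (the helper is hypothetical, and the two sequences plus the padding sample are assumed precomputed):

#include <stdint.h>

/* Simplified d63 model: row r takes sequence entries starting at r / 2,
 * padding with the last above sample once the sequence is exhausted. */
static void d63_rows_model(uint16_t *dst, int stride,
                           const uint16_t *avg2_seq, const uint16_t *avg3_seq,
                           int seq_len, uint16_t pad, int bs) {
  int r, c;
  for (r = 0; r < bs; ++r) {
    const uint16_t *seq = (r & 1) ? avg3_seq : avg2_seq;
    for (c = 0; c < bs; ++c) {
      const int i = (r >> 1) + c;
      dst[r * stride + c] = (i < seq_len) ? seq[i] : pad;
    }
  }
}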
+void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation. + uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4], + d1[4], d0_ext, d1_ext; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a10 = vld1q_u16(above + 10); + a16 = vld1q_u16(above + 16); + a17 = vld1q_u16(above + 17); + a18 = vld1q_u16(above + 18); + a24 = vld1q_u16(above + 24); + a25 = vld1q_u16(above + 25); + a26 = vld1q_u16(above + 26); + a31 = vld1q_dup_u16(above + 31); + + d0[0] = vrhaddq_u16(a0, a1); + d0[1] = vrhaddq_u16(a8, a9); + d0[2] = vrhaddq_u16(a16, a17); + d0[3] = vrhaddq_u16(a24, a25); + d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9); + d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17); + d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25); + + // Strip out the final element of d0/d1 so that we can replace it with an + // additional copy of above[31]; the first vector here doesn't matter, so + // just reuse the same vector. + d0_ext = vextq_u16(d0[3], d0[3], 7); + d1_ext = vextq_u16(d1[3], d1[3], 7); + + // Shuffle in duplicates of above[31] and store. Note that cases involving + // {d0,d1}_ext require an extra shift to undo the shifting out of the final + // element from above. + + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 0 * stride + 16, d0[2]); + vst1q_u16(dst + 0 * stride + 24, d0[3]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 1 * stride + 16, d1[2]); + vst1q_u16(dst + 1 * stride + 24, d1[3]); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0_ext, a31, 2)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1_ext, a31, 2)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0_ext, a31, 3)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1_ext, a31, 3)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0_ext, a31, 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1_ext, a31, 4)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 8 * stride + 16, 
vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0_ext, a31, 5)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1_ext, a31, 5)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0_ext, a31, 6)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1_ext, a31, 6)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0_ext, a31, 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1_ext, a31, 7)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 14 * stride + 24, a31); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 15 * stride + 24, a31); + + vst1q_u16(dst + 16 * stride + 0, d0[1]); + vst1q_u16(dst + 16 * stride + 8, d0[2]); + vst1q_u16(dst + 16 * stride + 16, vextq_u16(d0_ext, a31, 1)); + vst1q_u16(dst + 16 * stride + 24, a31); + vst1q_u16(dst + 17 * stride + 0, d1[1]); + vst1q_u16(dst + 17 * stride + 8, d1[2]); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(d1_ext, a31, 1)); + vst1q_u16(dst + 17 * stride + 24, a31); + + vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0_ext, a31, 2)); + vst1q_u16(dst + 18 * stride + 24, a31); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1_ext, a31, 2)); + vst1q_u16(dst + 19 * stride + 24, a31); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0_ext, a31, 3)); + vst1q_u16(dst + 20 * stride + 24, a31); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1_ext, a31, 3)); + vst1q_u16(dst + 21 * stride + 24, a31); + + vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0_ext, a31, 4)); + vst1q_u16(dst + 22 * stride + 24, a31); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst 
+ 23 * stride + 16, vextq_u16(d1_ext, a31, 4)); + vst1q_u16(dst + 23 * stride + 24, a31); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0_ext, a31, 5)); + vst1q_u16(dst + 24 * stride + 24, a31); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1_ext, a31, 5)); + vst1q_u16(dst + 25 * stride + 24, a31); + + vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0_ext, a31, 6)); + vst1q_u16(dst + 26 * stride + 24, a31); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1_ext, a31, 6)); + vst1q_u16(dst + 27 * stride + 24, a31); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0_ext, a31, 7)); + vst1q_u16(dst + 28 * stride + 24, a31); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1_ext, a31, 7)); + vst1q_u16(dst + 29 * stride + 24, a31); + + vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 30 * stride + 16, a31); + vst1q_u16(dst + 30 * stride + 24, a31); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 31 * stride + 16, a31); + vst1q_u16(dst + 31 * stride + 24, a31); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1; + (void)bd; + + az = vld1_u16(above - 1); + a0 = vld1_u16(above + 0); + // [ left[0], above[-1], above[0], above[1] ] + l0az = vext_u16(vld1_dup_u16(left), az, 3); + + l0 = vld1_u16(left + 0); + // The last lane here is unused, reading left[4] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], left[2], left[3], x ] + l1 = vext_u16(l0, l0, 1); + // [ above[-1], left[0], left[1], left[2] ] + azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3); + + d0 = vrhadd_u16(az, a0); + d1 = vrhadd_u16(vhadd_u16(l0az, a0), az); + + col0 = vrhadd_u16(vhadd_u16(azl0, l1), l0); + col0_even = vdup_lane_u16(col0, 0); + col0_odd = vdup_lane_u16(col0, 1); + + vst1_u16(dst + 0 * stride, d0); + vst1_u16(dst + 1 * stride, d1); + vst1_u16(dst + 2 * stride, vext_u16(col0_even, d0, 3)); + vst1_u16(dst + 3 * stride, vext_u16(col0_odd, d1, 3)); +} + +void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + // [ left[0], above[-1], ..., above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vextq_u16(l0, l0, 1); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], above[0]) + // ... + // d0[7] = AVG2(above[6], above[7]) + d0 = vrhaddq_u16(az, a0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vector to put the elements to be shifted in + // at the end: + // col0[7] = AVG3(above[-1], left[0], left[1]) + // col0[6] = AVG3(left[0], left[1], left[2]) + // ... + // col0[0] = AVG3(left[6], left[7], left[8]) + col0 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0 = vrev64q_u16(vextq_u16(col0, col0, 4)); + + // We don't care about the first parameter to this uzp since we only ever use + // the high three elements; we just use col0 again since it is already + // available: + // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] + // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] + col0_even = vuzpq_u16(col0, col0).val[1]; + col0_odd = vuzpq_u16(col0, col0).val[0]; + + // Incrementally shift more elements from col0 into d0/1: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] + // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] + // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] + // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] + // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] + vst1q_u16(dst + 0 * stride, d0); + vst1q_u16(dst + 1 * stride, d1); + vst1q_u16(dst + 2 * stride, vextq_u16(col0_even, d0, 7)); + vst1q_u16(dst + 3 * stride, vextq_u16(col0_odd, d1, 7)); + vst1q_u16(dst + 4 * stride, vextq_u16(col0_even, d0, 6)); + vst1q_u16(dst + 5 * stride, vextq_u16(col0_odd, d1, 6)); + vst1q_u16(dst + 6 * stride, vextq_u16(col0_even, d0, 5)); + vst1q_u16(dst + 7 * stride, vextq_u16(col0_odd, d1, 5)); +} + +void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, col0_lo, + col0_hi, col0_even, col0_odd, d0_lo, d0_hi, d1_lo, d1_hi; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + // [ left[0], above[-1], ..., above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[8] to avoid needing to + // materialize a zero: + // [ left[9], ... , left[15], x ]
, left[15], x ] + l9 = vextq_u16(l8, l8, 1); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0_lo = vrhaddq_u16(az, a0); + d0_hi = vrhaddq_u16(a7, a8); + d1_lo = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1_hi = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + + col0_lo = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0_hi = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + + // Reverse within each vector, then swap the array indices in the uzp to + // complete the reversal across all 16 elements. + col0_lo = vrev64q_u16(vextq_u16(col0_lo, col0_lo, 4)); + col0_hi = vrev64q_u16(vextq_u16(col0_hi, col0_hi, 4)); + col0_even = vuzpq_u16(col0_hi, col0_lo).val[1]; + col0_odd = vuzpq_u16(col0_hi, col0_lo).val[0]; + + vst1q_u16(dst + 0 * stride + 0, d0_lo); + vst1q_u16(dst + 0 * stride + 8, d0_hi); + vst1q_u16(dst + 1 * stride + 0, d1_lo); + vst1q_u16(dst + 1 * stride + 8, d1_hi); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even, d0_lo, 7)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_lo, d0_hi, 7)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd, d1_lo, 7)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_lo, d1_hi, 7)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even, d0_lo, 6)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_lo, d0_hi, 6)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd, d1_lo, 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_lo, d1_hi, 6)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even, d0_lo, 5)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_lo, d0_hi, 5)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd, d1_lo, 5)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_lo, d1_hi, 5)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even, d0_lo, 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_lo, d0_hi, 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd, d1_lo, 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_lo, d1_hi, 4)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even, d0_lo, 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_lo, d0_hi, 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd, d1_lo, 3)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_lo, d1_hi, 3)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even, d0_lo, 2)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_lo, d0_hi, 2)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd, d1_lo, 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_lo, d1_hi, 2)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even, d0_lo, 1)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_lo, d0_hi, 1)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd, d1_lo, 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_lo, d1_hi, 1)); +} + +void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7, + l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], col0[4], + col0_even[2], col0_odd[2]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a14 = vld1q_u16(above + 14); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a22 = vld1q_u16(above + 22); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + // [ left[0], above[-1], ..., left[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = 
+void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7, + l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], col0[4], + col0_even[2], col0_odd[2]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a14 = vld1q_u16(above + 14); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a22 = vld1q_u16(above + 22); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + // [ left[0], above[-1], ..., above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l15 = vld1q_u16(left + 15); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l23 = vld1q_u16(left + 23); + l24 = vld1q_u16(left + 24); + // The last lane here is unused, reading left[32] could cause a buffer + // over-read, so just fill with a duplicate of left[24] to avoid needing to + // materialize a zero: + // [ left[25], ... , left[31], x ] + l25 = vextq_u16(l24, l24, 1); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(az, a0); + d0[1] = vrhaddq_u16(a7, a8); + d0[2] = vrhaddq_u16(a15, a16); + d0[3] = vrhaddq_u16(a23, a24); + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15); + d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23); + + col0[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + col0[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16); + col0[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24); + + // Reverse within each vector, then swap the array indices in both the uzp + // and the col0_{even,odd} assignment to complete the reversal across all + // 32 elements. + col0[0] = vrev64q_u16(vextq_u16(col0[0], col0[0], 4)); + col0[1] = vrev64q_u16(vextq_u16(col0[1], col0[1], 4)); + col0[2] = vrev64q_u16(vextq_u16(col0[2], col0[2], 4)); + col0[3] = vrev64q_u16(vextq_u16(col0[3], col0[3], 4)); + + col0_even[1] = vuzpq_u16(col0[1], col0[0]).val[1]; + col0_even[0] = vuzpq_u16(col0[3], col0[2]).val[1]; + col0_odd[1] = vuzpq_u16(col0[1], col0[0]).val[0]; + col0_odd[0] = vuzpq_u16(col0[3], col0[2]).val[0]; + + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 0 * stride + 16, d0[2]); + vst1q_u16(dst + 0 * stride + 24, d0[3]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 1 * stride + 16, d1[2]); + vst1q_u16(dst + 1 * stride + 24, d1[3]); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even[1], d0[0], 7)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd[1], d1[0], 7)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 7)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even[1], d0[0], 6)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd[1], d1[0], 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[2], d1[3], 6)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even[1], d0[0], 5)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd[1], d1[0], 5)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[0], d1[1], 5)); +
vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[2], d1[3], 5)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even[1], d0[0], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd[1], d1[0], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[2], d1[3], 4)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even[1], d0[0], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd[1], d1[0], 3)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[2], d1[3], 3)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even[1], d0[0], 2)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd[1], d1[0], 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[2], d1[3], 2)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even[1], d0[0], 1)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd[1], d1[0], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[2], d1[3], 1)); + + vst1q_u16(dst + 16 * stride + 0, col0_even[1]); + vst1q_u16(dst + 16 * stride + 8, d0[0]); + vst1q_u16(dst + 16 * stride + 16, d0[1]); + vst1q_u16(dst + 16 * stride + 24, d0[2]); + vst1q_u16(dst + 17 * stride + 0, col0_odd[1]); + vst1q_u16(dst + 17 * stride + 8, d1[0]); + vst1q_u16(dst + 17 * stride + 16, d1[1]); + vst1q_u16(dst + 17 * stride + 24, d1[2]); + + vst1q_u16(dst + 18 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 7)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(col0_even[1], d0[0], 7)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 7)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(col0_odd[1], d1[0], 7)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(d1[1], d1[2], 7)); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 6)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(col0_even[1], d0[0], 6)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 20 * stride + 24, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 6)); + vst1q_u16(dst + 21 * stride + 8, 
vextq_u16(col0_odd[1], d1[0], 6)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(d1[1], d1[2], 6)); + + vst1q_u16(dst + 22 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 5)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(col0_even[1], d0[0], 5)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 5)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(col0_odd[1], d1[0], 5)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(d1[1], d1[2], 5)); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 4)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(col0_even[1], d0[0], 4)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 24 * stride + 24, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 4)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(col0_odd[1], d1[0], 4)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(d1[1], d1[2], 4)); + + vst1q_u16(dst + 26 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 3)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(col0_even[1], d0[0], 3)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 3)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(col0_odd[1], d1[0], 3)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(d1[1], d1[2], 3)); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 2)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(col0_even[1], d0[0], 2)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 28 * stride + 24, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 2)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(col0_odd[1], d1[0], 2)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(d1[1], d1[2], 2)); + + vst1q_u16(dst + 30 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 1)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(col0_even[1], d0[0], 1)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 1)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(col0_odd[1], d1[0], 1)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(d1[1], d1[2], 1)); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. 
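+ // In the lower-left part of the block the even columns hold the AVG2 values + // (d0) of the left edge and the odd columns the AVG3 values (d2), while the + // remainder of each row comes from the AVG3 row d1; this is why d2 and d0 + // are reversed and zipped into a single d20 pair before being shifted in.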
+ uint16x4_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d20_lo, d20_hi; + (void)bd; + + az = vld1_u16(above - 1); + a0 = vld1_u16(above + 0); + // [ left[0], above[-1], above[0], above[1] ] + l0az = vext_u16(vld1_dup_u16(left), az, 3); + + l0 = vld1_u16(left); + // The last lane here is unused, reading left[4] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], left[2], left[3], x ] + l1 = vext_u16(l0, l0, 1); + // [ above[-1], left[0], left[1], left[2] ] + azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3); + + d0 = vrhadd_u16(azl0, l0); + d1 = vrhadd_u16(vhadd_u16(l0az, a0), az); + d2 = vrhadd_u16(vhadd_u16(azl0, l1), l0); + + d20_lo = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[0]; + d20_hi = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1] ] + vst1_u16(dst + 0 * stride, vext_u16(d20_hi, d1, 3)); + vst1_u16(dst + 1 * stride, vext_u16(d20_hi, d1, 1)); + vst1_u16(dst + 2 * stride, vext_u16(d20_lo, d20_hi, 3)); + vst1_u16(dst + 3 * stride, vext_u16(d20_lo, d20_hi, 1)); +} + +void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d0_rev, d2_rev, d20_lo, + d20_hi; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vextq_u16(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], left[0]) + // d0[1] = AVG2(left[0], left[1]) + // ... + // d0[7] = AVG2(left[6], left[7]) + d0 = vrhaddq_u16(azl0, l0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + + // d2[0] = AVG3(above[-1], left[0], left[1]) + // d2[1] = AVG3(left[0], left[1], left[2]) + // ... 
+ // d2[7] = AVG3(left[6], left[7], left[8]) + d2 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vectors to put the elements to be shifted + // in at the end: + d0_rev = vrev64q_u16(vextq_u16(d0, d0, 4)); + d2_rev = vrev64q_u16(vextq_u16(d2, d2, 4)); + + d20_lo = vzipq_u16(d2_rev, d0_rev).val[0]; + d20_hi = vzipq_u16(d2_rev, d0_rev).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ] + // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ] + // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ] + // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ] + // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ] + vst1q_u16(dst + 0 * stride, vextq_u16(d20_hi, d1, 7)); + vst1q_u16(dst + 1 * stride, vextq_u16(d20_hi, d1, 5)); + vst1q_u16(dst + 2 * stride, vextq_u16(d20_hi, d1, 3)); + vst1q_u16(dst + 3 * stride, vextq_u16(d20_hi, d1, 1)); + vst1q_u16(dst + 4 * stride, vextq_u16(d20_lo, d20_hi, 7)); + vst1q_u16(dst + 5 * stride, vextq_u16(d20_lo, d20_hi, 5)); + vst1q_u16(dst + 6 * stride, vextq_u16(d20_lo, d20_hi, 3)); + vst1q_u16(dst + 7 * stride, vextq_u16(d20_lo, d20_hi, 1)); +}
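+ +// The store tables above make the d153 geometry explicit: each row is the row +// above shifted right by two pixels, which is why the vext offset drops by +// two per row (7, 5, 3, 1) before the d20 source pair itself advances. +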
+void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. + uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, d0[2], d1[2], + d2[2], d20[4]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + // [ left[0], above[-1], ... , above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[8] to avoid needing to + // materialize a zero: + // [ left[9], ... , left[15], x ] + l9 = vextq_u16(l8, l8, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(azl0, l0); + d0[1] = vrhaddq_u16(l7, l8); + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + + d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4)); + d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4)); + d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4)); + d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4)); + + d20[0] = vzipq_u16(d2[1], d0[1]).val[0]; + d20[1] = vzipq_u16(d2[1], d0[1]).val[1]; + d20[2] = vzipq_u16(d2[0], d0[0]).val[0]; + d20[3] = vzipq_u16(d2[0], d0[0]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[3], d1[0], 7)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[3], d1[0], 5)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[3], d1[0], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[3], d1[0], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[3], d1[0], 7)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[3], d1[0], 5)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[3], d1[0], 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[3], d1[0], 1)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[2], d20[3], 1)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[0], d20[1], 7)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[0], d20[1], 5)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[0], d20[1], 3)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[0], d20[1], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[1], d20[2], 1)); +}
+ +void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. + uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7, + l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], d2[4], d20[8]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a14 = vld1q_u16(above + 14); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a22 = vld1q_u16(above + 22); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + // [ left[0], above[-1], ... , above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l15 = vld1q_u16(left + 15); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l23 = vld1q_u16(left + 23); + l24 = vld1q_u16(left + 24); + // The last lane here is unused, reading left[32] could cause a buffer + // over-read, so just fill with a duplicate of left[24] to avoid needing to + // materialize a zero: + // [ left[25], ... , left[31], x ] + l25 = vextq_u16(l24, l24, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(azl0, l0); + d0[1] = vrhaddq_u16(l7, l8); + d0[2] = vrhaddq_u16(l15, l16); + d0[3] = vrhaddq_u16(l23, l24); + + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15); + d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23); + + d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + d2[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16); + d2[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24); + + d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4)); + d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4)); + d0[2] = vrev64q_u16(vextq_u16(d0[2], d0[2], 4)); + d0[3] = vrev64q_u16(vextq_u16(d0[3], d0[3], 4)); + d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4)); + d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4)); + d2[2] = vrev64q_u16(vextq_u16(d2[2], d2[2], 4)); + d2[3] = vrev64q_u16(vextq_u16(d2[3], d2[3], 4)); + + d20[0] = vzipq_u16(d2[3], d0[3]).val[0]; + d20[1] = vzipq_u16(d2[3], d0[3]).val[1]; + d20[2] = vzipq_u16(d2[2], d0[2]).val[0]; + d20[3] = vzipq_u16(d2[2], d0[2]).val[1]; + d20[4] = vzipq_u16(d2[1], d0[1]).val[0]; + d20[5] = vzipq_u16(d2[1], d0[1]).val[1]; + d20[6] = vzipq_u16(d2[0], d0[0]).val[0]; + d20[7] = vzipq_u16(d2[0], d0[0]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 0 * stride + 16, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 0 * stride + 24, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 1 * stride + 16, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 1 * stride + 24, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 1)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 6 * stride + 0, 
vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[1], d1[2], 1)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[0], d1[1], 1)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(d20[7], d1[0], 1)); + + vst1q_u16(dst + 16 * stride + 0, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 16 * stride + 8, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 16 * stride + 16, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 16 * stride + 24, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 17 * stride + 0, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 17 * stride + 8, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 17 * stride + 24, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 18 * stride + 0, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 19 * stride + 
16, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(d20[6], d20[7], 1)); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 20 * stride + 24, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 22 * stride + 0, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(d20[5], d20[6], 1)); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 24 * stride + 24, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 26 * stride + 0, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(d20[4], d20[5], 1)); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(d20[0], d20[1], 7)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 28 * stride + 24, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(d20[0], d20[1], 5)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 30 * stride + 0, vextq_u16(d20[0], d20[1], 3)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(d20[0], d20[1], 1)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(d20[3], d20[4], 1)); } // ----------------------------------------------------------------------------- @@ -696,6 +1821,311 @@ void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, 
//------------------------------------------------------------------------------ +void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t l0, l1, l2, l3, c0, c1, c01_lo, c01_hi; + (void)above; + (void)bd; + + l0 = vld1_u16(left + 0); + l3 = vld1_dup_u16(left + 3); + + // [ left[1], left[2], left[3], left[3] ] + l1 = vext_u16(l0, l3, 1); + // [ left[2], left[3], left[3], left[3] ] + l2 = vext_u16(l0, l3, 2); + + c0 = vrhadd_u16(l0, l1); + c1 = vrhadd_u16(vhadd_u16(l0, l2), l1); + + c01_lo = vzip_u16(c0, c1).val[0]; + c01_hi = vzip_u16(c0, c1).val[1]; + + // stride=0 [ c0[0], c1[0], c0[1], c1[1] ] + // stride=1 [ c0[1], c1[1], c0[2], c1[2] ] + // stride=2 [ c0[2], c1[2], c0[3], c1[3] ] + // stride=3 [ c0[3], c1[3], left[3], left[3] ] + vst1_u16(dst + 0 * stride, c01_lo); + vst1_u16(dst + 1 * stride, vext_u16(c01_lo, c01_hi, 2)); + vst1_u16(dst + 2 * stride, c01_hi); + vst1_u16(dst + 3 * stride, vext_u16(c01_hi, l3, 2)); +} + +void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l7, c0, c1, c01_lo, c01_hi; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l7 = vld1q_dup_u16(left + 7); + + // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ] + l1 = vextq_u16(l0, l7, 1); + // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ] + l2 = vextq_u16(l0, l7, 2); + + c0 = vrhaddq_u16(l0, l1); + c1 = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + + c01_lo = vzipq_u16(c0, c1).val[0]; + c01_hi = vzipq_u16(c0, c1).val[1]; + + vst1q_u16(dst + 0 * stride, c01_lo); + vst1q_u16(dst + 1 * stride, vextq_u16(c01_lo, c01_hi, 2)); + vst1q_u16(dst + 2 * stride, vextq_u16(c01_lo, c01_hi, 4)); + vst1q_u16(dst + 3 * stride, vextq_u16(c01_lo, c01_hi, 6)); + vst1q_u16(dst + 4 * stride, c01_hi); + vst1q_u16(dst + 5 * stride, vextq_u16(c01_hi, l7, 2)); + vst1q_u16(dst + 6 * stride, vextq_u16(c01_hi, l7, 4)); + vst1q_u16(dst + 7 * stride, vextq_u16(c01_hi, l7, 6)); +} + +void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l8, l9, l10, l15, c0[2], c1[2], c01[4]; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l2 = vld1q_u16(left + 2); + l8 = vld1q_u16(left + 8); + l15 = vld1q_dup_u16(left + 15); + + l9 = vextq_u16(l8, l15, 1); + l10 = vextq_u16(l8, l15, 2); + + c0[0] = vrhaddq_u16(l0, l1); + c0[1] = vrhaddq_u16(l8, l9); + c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9); + + c01[0] = vzipq_u16(c0[0], c1[0]).val[0]; + c01[1] = vzipq_u16(c0[0], c1[0]).val[1]; + c01[2] = vzipq_u16(c0[1], c1[1]).val[0]; + c01[3] = vzipq_u16(c0[1], c1[1]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, c01[0]); + vst1q_u16(dst + 0 * stride + 8, c01[1]); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6)); + + vst1q_u16(dst + 4 * stride + 0, c01[1]); + vst1q_u16(dst + 4 * stride + 8, c01[2]); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 5 * stride + 8, 
vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6)); + + vst1q_u16(dst + 8 * stride + 0, c01[2]); + vst1q_u16(dst + 8 * stride + 8, c01[3]); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], l15, 2)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], l15, 4)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], l15, 6)); + + vst1q_u16(dst + 12 * stride + 0, c01[3]); + vst1q_u16(dst + 12 * stride + 8, l15); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], l15, 2)); + vst1q_u16(dst + 13 * stride + 8, l15); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], l15, 4)); + vst1q_u16(dst + 14 * stride + 8, l15); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], l15, 6)); + vst1q_u16(dst + 15 * stride + 8, l15); +} + +void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l8, l9, l10, l16, l17, l18, l24, l25, l26, l31, c0[4], + c1[4], c01[8]; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l2 = vld1q_u16(left + 2); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l10 = vld1q_u16(left + 10); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l18 = vld1q_u16(left + 18); + l24 = vld1q_u16(left + 24); + l31 = vld1q_dup_u16(left + 31); + + l25 = vextq_u16(l24, l31, 1); + l26 = vextq_u16(l24, l31, 2); + + c0[0] = vrhaddq_u16(l0, l1); + c0[1] = vrhaddq_u16(l8, l9); + c0[2] = vrhaddq_u16(l16, l17); + c0[3] = vrhaddq_u16(l24, l25); + c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9); + c1[2] = vrhaddq_u16(vhaddq_u16(l16, l18), l17); + c1[3] = vrhaddq_u16(vhaddq_u16(l24, l26), l25); + + c01[0] = vzipq_u16(c0[0], c1[0]).val[0]; + c01[1] = vzipq_u16(c0[0], c1[0]).val[1]; + c01[2] = vzipq_u16(c0[1], c1[1]).val[0]; + c01[3] = vzipq_u16(c0[1], c1[1]).val[1]; + c01[4] = vzipq_u16(c0[2], c1[2]).val[0]; + c01[5] = vzipq_u16(c0[2], c1[2]).val[1]; + c01[6] = vzipq_u16(c0[3], c1[3]).val[0]; + c01[7] = vzipq_u16(c0[3], c1[3]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, c01[0]); + vst1q_u16(dst + 0 * stride + 8, c01[1]); + vst1q_u16(dst + 0 * stride + 16, c01[2]); + vst1q_u16(dst + 0 * stride + 24, c01[3]); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 1 * stride + 16, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 1 * stride + 24, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(c01[3], c01[4], 6)); + + vst1q_u16(dst + 4 * stride + 0, c01[1]); + vst1q_u16(dst + 4 * stride + 8, c01[2]); + 
vst1q_u16(dst + 4 * stride + 16, c01[3]); + vst1q_u16(dst + 4 * stride + 24, c01[4]); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(c01[4], c01[5], 6)); + + vst1q_u16(dst + 8 * stride + 0, c01[2]); + vst1q_u16(dst + 8 * stride + 8, c01[3]); + vst1q_u16(dst + 8 * stride + 16, c01[4]); + vst1q_u16(dst + 8 * stride + 24, c01[5]); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(c01[5], c01[6], 6)); + + vst1q_u16(dst + 12 * stride + 0, c01[3]); + vst1q_u16(dst + 12 * stride + 8, c01[4]); + vst1q_u16(dst + 12 * stride + 16, c01[5]); + vst1q_u16(dst + 12 * stride + 24, c01[6]); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(c01[6], c01[7], 6)); + + vst1q_u16(dst + 16 * stride + 0, c01[4]); + vst1q_u16(dst + 16 * stride + 8, c01[5]); + vst1q_u16(dst + 16 * stride + 16, c01[6]); + vst1q_u16(dst + 16 * stride + 24, c01[7]); + vst1q_u16(dst + 17 * stride + 0, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 17 * stride + 8, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 17 * stride + 24, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 18 * stride + 0, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 19 * 
stride + 0, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(c01[7], l31, 6)); + + vst1q_u16(dst + 20 * stride + 0, c01[5]); + vst1q_u16(dst + 20 * stride + 8, c01[6]); + vst1q_u16(dst + 20 * stride + 16, c01[7]); + vst1q_u16(dst + 20 * stride + 24, l31); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 22 * stride + 0, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(l31, l31, 6)); + + vst1q_u16(dst + 24 * stride + 0, c01[6]); + vst1q_u16(dst + 24 * stride + 8, c01[7]); + vst1q_u16(dst + 24 * stride + 16, l31); + vst1q_u16(dst + 24 * stride + 24, l31); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 26 * stride + 0, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(l31, l31, 6)); + + vst1q_u16(dst + 28 * stride + 0, c01[7]); + vst1q_u16(dst + 28 * stride + 8, l31); + vst1q_u16(dst + 28 * stride + 16, l31); + vst1q_u16(dst + 28 * stride + 24, l31); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 30 * stride + 0, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(l31, l31, 6)); +} + +//------------------------------------------------------------------------------ + void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { @@ -725,30 +2155,36 @@ void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t row = vld2q_u16(above); + const uint16x8_t row0 = vld1q_u16(above + 0); + 
const uint16x8_t row1 = vld1q_u16(above + 8); int i; (void)left; (void)bd; - for (i = 0; i < 16; i++, dst += stride) { - vst2q_u16(dst, row); + for (i = 0; i < 16; i++) { + vst1q_u16(dst + 0, row0); + vst1q_u16(dst + 8, row1); + dst += stride; } } void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t row0 = vld2q_u16(above); - const uint16x8x2_t row1 = vld2q_u16(above + 16); + const uint16x8_t row0 = vld1q_u16(above + 0); + const uint16x8_t row1 = vld1q_u16(above + 8); + const uint16x8_t row2 = vld1q_u16(above + 16); + const uint16x8_t row3 = vld1q_u16(above + 24); int i; (void)left; (void)bd; for (i = 0; i < 32; i++) { - vst2q_u16(dst, row0); - dst += 16; - vst2q_u16(dst, row1); - dst += stride - 16; + vst1q_u16(dst + 0, row0); + vst1q_u16(dst + 8, row1); + vst1q_u16(dst + 16, row2); + vst1q_u16(dst + 24, row3); + dst += stride; } } diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index b9f72a94c..c2ad34a69 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -13,6 +13,8 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1, @@ -94,26 +96,25 @@ highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. // High half has identical elements, but we can reconstruct it from the low // half by duplicating the 2nd element. So we only need to pass a 4x32-bit // vector - int32x4_t zbin = vmovl_s16(vld1_s16(zbin_ptr)); - int32x4_t round = vmovl_s16(vld1_s16(round_ptr)); + int32x4_t zbin = vmovl_s16(vld1_s16(mb_plane->zbin)); + int32x4_t round = vmovl_s16(vld1_s16(mb_plane->round)); // Extend the quant, quant_shift vectors to ones of 32-bit elements // scale to high-half, so we can use vqdmulhq_s32 - int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); - int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 15); + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); + int32x4_t quant_shift = + vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 15); int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); // Process first 8 values which include a dc component. 
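+ + // Note on the quant/quant_shift setup above: vqdmulhq_s32(x, q << 15) + // computes (2 * x * (q << 15)) >> 32 == (x * q) >> 16, so the pre-shifted + // vectors reproduce the scalar quantizer's ((tmp * quant) >> 16) stages + // without needing a widening multiply; the 32x32 variant below shifts + // quant_shift by 16 to implement its final >> 15 instead.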
@@ -164,7 +165,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } while (n_coeffs > 0); } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -174,11 +175,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)n_coeffs; - (void)scan; +#endif // VPX_ARCH_AARCH64 } static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) { @@ -224,25 +221,25 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( } void vpx_highbd_quantize_b_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. // High half has identical elements, but we can reconstruct it from the low // half by duplicating the 2nd element. So we only need to pass a 4x32-bit // vector - int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1); - int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1); + int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1); + int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1); // Extend the quant, quant_shift vectors to ones of 32-bit elements // scale to high-half, so we can use vqdmulhq_s32 - int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); - int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16); + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); + int32x4_t quant_shift = + vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16); int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); // Process first 8 values which include a dc component. @@ -289,7 +286,7 @@ void vpx_highbd_quantize_b_32x32_neon( } } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -299,9 +296,5 @@ void vpx_highbd_quantize_b_32x32_neon( const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)n_coeffs; - (void)scan; +#endif // VPX_ARCH_AARCH64 } diff --git a/vpx_dsp/arm/highbd_sad4d_neon.c b/vpx_dsp/arm/highbd_sad4d_neon.c new file mode 100644 index 000000000..a6684b053 --- /dev/null +++ b/vpx_dsp/arm/highbd_sad4d_neon.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint16x4_t s = vld1_u16(src16_ptr + i * src_stride); + uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride); + uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride); + uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride); + uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride); + + sum[0] = vabal_u16(sum[0], s, r0); + sum[1] = vabal_u16(sum[1], s, r1); + sum[2] = vabal_u16(sum[2], s, r2); + sum[3] = vabal_u16(sum[3], s, r3); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32[4]; + + int i = 0; + do { + uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); + + sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride)); + sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride)); + sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride)); + sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride)); + + } while (++i < h); + + sum_u32[0] = vpaddlq_u16(sum[0]); + sum_u32[1] = vpaddlq_u16(sum[1]); + sum_u32[2] = vpaddlq_u16(sum[2]); + sum_u32[3] = vpaddlq_u16(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum_u32)); +} + +static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, + uint32x4_t *const sad_sum) { + uint16x8_t abs_diff = vabdq_u16(src, ref); + *sad_sum = vpadalq_u16(*sad_sum, abs_diff); +} + +static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint16x8_t s0, s1; + + s0 = 
vld1q_u16(src16_ptr + i * src_stride); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u16(src16_ptr + i * src_stride + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], int w, + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3; + + s0 = vld1q_u16(src16_ptr + i * src_stride + j); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]); + + s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]); + + s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16); + sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16), + &sum_lo[0]); + sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16), + &sum_lo[1]); + sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16), + &sum_lo[2]); + sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16), + &sum_lo[3]); + + s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24); + sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24), + &sum_hi[0]); + sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24), + &sum_hi[1]); + sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24), + &sum_hi[2]); + sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24), + &sum_hi[3]); + + j += 32; + } while (j < w); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const 
ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, h); +} + +static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, h); +} + +#define HBD_SAD_WXH_4D_NEON(w, h) \ + void vpx_highbd_sad##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx4d_neon(src, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ + } + +HBD_SAD_WXH_4D_NEON(4, 4) +HBD_SAD_WXH_4D_NEON(4, 8) + +HBD_SAD_WXH_4D_NEON(8, 4) +HBD_SAD_WXH_4D_NEON(8, 8) +HBD_SAD_WXH_4D_NEON(8, 16) + +HBD_SAD_WXH_4D_NEON(16, 8) +HBD_SAD_WXH_4D_NEON(16, 16) +HBD_SAD_WXH_4D_NEON(16, 32) + +HBD_SAD_WXH_4D_NEON(32, 16) +HBD_SAD_WXH_4D_NEON(32, 32) +HBD_SAD_WXH_4D_NEON(32, 64) + +HBD_SAD_WXH_4D_NEON(64, 32) +HBD_SAD_WXH_4D_NEON(64, 64) + +#undef HBD_SAD_WXH_4D_NEON + +#define HBD_SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_highbd_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx4d_neon(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +HBD_SAD_SKIP_WXH_4D_NEON(4, 4) +HBD_SAD_SKIP_WXH_4D_NEON(4, 8) + +HBD_SAD_SKIP_WXH_4D_NEON(8, 4) +HBD_SAD_SKIP_WXH_4D_NEON(8, 8) +HBD_SAD_SKIP_WXH_4D_NEON(8, 16) + +HBD_SAD_SKIP_WXH_4D_NEON(16, 8) +HBD_SAD_SKIP_WXH_4D_NEON(16, 16) +HBD_SAD_SKIP_WXH_4D_NEON(16, 32) + +HBD_SAD_SKIP_WXH_4D_NEON(32, 16) +HBD_SAD_SKIP_WXH_4D_NEON(32, 32) +HBD_SAD_SKIP_WXH_4D_NEON(32, 64) + +HBD_SAD_SKIP_WXH_4D_NEON(64, 32) +HBD_SAD_SKIP_WXH_4D_NEON(64, 64) + +#undef HBD_SAD_SKIP_WXH_4D_NEON diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c index ecb52ce5a..b99bac66c 100644 --- a/vpx_dsp/arm/highbd_sad_neon.c +++ b/vpx_dsp/arm/highbd_sad_neon.c @@ -17,209 +17,392 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -static VPX_FORCE_INLINE uint32_t highbd_sad4_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int width, - int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sad4xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 4) { - const uint16x4_t src_u16 = vld1_u16(src16_ptr + j); - const uint16x4_t ref_u16 = vld1_u16(ref16_ptr + j); - sum_abs_diff = vabal_u16(sum_abs_diff, src_u16, ref_u16); - } + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + sum = vabal_u16(sum, s, r); + src16_ptr += src_stride; ref16_ptr += ref_stride; - } + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint32x4(sum); } -static VPX_FORCE_INLINE uint32_t highbd_sad8_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int width, - int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t 
highbd_sad8xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 8) { - const uint16x8_t src_u16 = vld1q_u16(src16_ptr + j); - const uint16x8_t ref_u16 = vld1q_u16(ref16_ptr + j); - sum_abs_diff = - vabal_u16(sum_abs_diff, vget_low_u16(src_u16), vget_low_u16(ref_u16)); - sum_abs_diff = vabal_u16(sum_abs_diff, vget_high_u16(src_u16), - vget_high_u16(ref_u16)); - } + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + sum = vabaq_u16(sum, s, r); + src16_ptr += src_stride; ref16_ptr += ref_stride; - } + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint16x8(sum); } -static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon( - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, - int ref_stride, const uint8_t *second_pred, int width, int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 4) { - const uint16x4_t a_u16 = vld1_u16(src16_ptr + j); - const uint16x4_t b_u16 = vld1_u16(ref16_ptr + j); - const uint16x4_t c_u16 = vld1_u16(pred_ptr + j); - const uint16x4_t avg = vrhadd_u16(b_u16, c_u16); - sum_abs_diff = vabal_u16(sum_abs_diff, a_u16, avg); - } + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint16x8_t s0, s1, r0, r1; + uint16x8_t diff0, diff1; + + s0 = vld1q_u16(src16_ptr); + r0 = vld1q_u16(ref16_ptr); + diff0 = vabdq_u16(s0, r0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + 8); + r1 = vld1q_u16(ref16_ptr + 8); + diff1 = vabdq_u16(s1, r1); + sum[1] = vpadalq_u16(sum[1], diff1); + src16_ptr += src_stride; ref16_ptr += ref_stride; - pred_ptr += width; - } + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_uint32x4(sum[0]); } -static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon( - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, - int ref_stride, const uint8_t *second_pred, int width, int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 8) { - const uint16x8_t a_u16 = vld1q_u16(src16_ptr + j); - const uint16x8_t b_u16 = vld1q_u16(ref16_ptr + j); - const uint16x8_t c_u16 = vld1q_u16(pred_ptr + j); - const uint16x8_t avg = vrhaddq_u16(b_u16, c_u16); - sum_abs_diff = - vabal_u16(sum_abs_diff, vget_low_u16(a_u16), vget_low_u16(avg)); - sum_abs_diff = - vabal_u16(sum_abs_diff, vget_high_u16(a_u16), vget_high_u16(avg)); - } + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + 
vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3; + uint16x8_t diff0, diff1, diff2, diff3; + + s0 = vld1q_u16(src16_ptr + j); + r0 = vld1q_u16(ref16_ptr + j); + diff0 = vabdq_u16(s0, r0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + j + 8); + r1 = vld1q_u16(ref16_ptr + j + 8); + diff1 = vabdq_u16(s1, r1); + sum[1] = vpadalq_u16(sum[1], diff1); + + s2 = vld1q_u16(src16_ptr + j + 16); + r2 = vld1q_u16(ref16_ptr + j + 16); + diff2 = vabdq_u16(s2, r2); + sum[2] = vpadalq_u16(sum[2], diff2); + + s3 = vld1q_u16(src16_ptr + j + 24); + r3 = vld1q_u16(ref16_ptr + j + 24); + diff3 = vabdq_u16(s3, r3); + sum[3] = vpadalq_u16(sum[3], diff3); + + j += 32; + } while (j < w); + src16_ptr += src_stride; ref16_ptr += ref_stride; - pred_ptr += width; - } + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[0] = vaddq_u32(sum[0], sum[2]); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint32x4(sum[0]); } -#define highbd_sad4MxN(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ - } +static INLINE unsigned int highbd_sad64xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); +} -#define highbd_sadMxN(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ - } +static INLINE unsigned int highbd_sad32xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); +} -#define highbd_sad4MxN_avg(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, const uint8_t *second_pred) { \ - return highbd_sad4_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ - second_pred, m, n); \ +#define HBD_SAD_WXH_NEON(w, h) \ + unsigned int vpx_highbd_sad##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ } -#define highbd_sadMxN_avg(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, const uint8_t *second_pred) { \ - return highbd_sad8_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ - second_pred, m, n); \ +HBD_SAD_WXH_NEON(4, 4) +HBD_SAD_WXH_NEON(4, 8) + +HBD_SAD_WXH_NEON(8, 4) +HBD_SAD_WXH_NEON(8, 8) +HBD_SAD_WXH_NEON(8, 16) + +HBD_SAD_WXH_NEON(16, 8) +HBD_SAD_WXH_NEON(16, 16) +HBD_SAD_WXH_NEON(16, 32) + +HBD_SAD_WXH_NEON(32, 16) +HBD_SAD_WXH_NEON(32, 32) +HBD_SAD_WXH_NEON(32, 64) + +HBD_SAD_WXH_NEON(64, 32) +HBD_SAD_WXH_NEON(64, 64) + +#undef HBD_SAD_WXH_NEON + +#define HBD_SAD_SKIP_WXH_NEON(w, h) \ + unsigned int vpx_highbd_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad##w##xh_neon(src, 2 * src_stride, ref, \ + 2 * ref_stride, (h) / 2); \ } -#define highbd_sadMxNx4D(m, n) \ - void 
vpx_highbd_sad##m##x##n##x4d_neon( \ - const uint8_t *src_ptr, int src_stride, \ - const uint8_t *const ref_array[4], int ref_stride, \ - uint32_t sad_array[4]) { \ - int i; \ - for (i = 0; i < 4; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_neon(src_ptr, src_stride, \ - ref_array[i], ref_stride); \ - } \ +HBD_SAD_SKIP_WXH_NEON(4, 4) +HBD_SAD_SKIP_WXH_NEON(4, 8) + +HBD_SAD_SKIP_WXH_NEON(8, 4) +HBD_SAD_SKIP_WXH_NEON(8, 8) +HBD_SAD_SKIP_WXH_NEON(8, 16) + +HBD_SAD_SKIP_WXH_NEON(16, 8) +HBD_SAD_SKIP_WXH_NEON(16, 16) +HBD_SAD_SKIP_WXH_NEON(16, 32) + +HBD_SAD_SKIP_WXH_NEON(32, 16) +HBD_SAD_SKIP_WXH_NEON(32, 32) +HBD_SAD_SKIP_WXH_NEON(32, 64) + +HBD_SAD_SKIP_WXH_NEON(64, 32) +HBD_SAD_SKIP_WXH_NEON(64, 64) + +#undef HBD_SAD_SKIP_WXH_NEON + +static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + uint16x4_t p = vld1_u16(pred16_ptr); + + uint16x4_t avg = vrhadd_u16(r, p); + sum = vabal_u16(sum, s, avg); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 4; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); +} + +static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + uint16x8_t p = vld1q_u16(pred16_ptr); + + uint16x8_t avg = vrhaddq_u16(r, p); + uint16x8_t diff = vabdq_u16(s, avg); + sum = vpadalq_u16(sum, diff); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 8; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); +} + +static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint16x8_t s0, s1, r0, r1, p0, p1; + uint16x8_t avg0, avg1, diff0, diff1; + + s0 = vld1q_u16(src16_ptr); + r0 = vld1q_u16(ref16_ptr); + p0 = vld1q_u16(pred16_ptr); + avg0 = vrhaddq_u16(r0, p0); + diff0 = vabdq_u16(s0, avg0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + 8); + r1 = vld1q_u16(ref16_ptr + 8); + p1 = vld1q_u16(pred16_ptr + 8); + avg1 = vrhaddq_u16(r1, p1); + diff1 = vabdq_u16(s1, avg1); + sum[1] = vpadalq_u16(sum[1], diff1); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 16; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_uint32x4(sum[0]); +} + +static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + 
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; + + s0 = vld1q_u16(src16_ptr + j); + r0 = vld1q_u16(ref16_ptr + j); + p0 = vld1q_u16(pred16_ptr + j); + avg0 = vrhaddq_u16(r0, p0); + diff0 = vabdq_u16(s0, avg0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + j + 8); + r1 = vld1q_u16(ref16_ptr + j + 8); + p1 = vld1q_u16(pred16_ptr + j + 8); + avg1 = vrhaddq_u16(r1, p1); + diff1 = vabdq_u16(s1, avg1); + sum[1] = vpadalq_u16(sum[1], diff1); + + s2 = vld1q_u16(src16_ptr + j + 16); + r2 = vld1q_u16(ref16_ptr + j + 16); + p2 = vld1q_u16(pred16_ptr + j + 16); + avg2 = vrhaddq_u16(r2, p2); + diff2 = vabdq_u16(s2, avg2); + sum[2] = vpadalq_u16(sum[2], diff2); + + s3 = vld1q_u16(src16_ptr + j + 24); + r3 = vld1q_u16(ref16_ptr + j + 24); + p3 = vld1q_u16(pred16_ptr + j + 24); + avg3 = vrhaddq_u16(r3, p3); + diff3 = vabdq_u16(s3, avg3); + sum[3] = vpadalq_u16(sum[3], diff3); + + j += 32; + } while (j < w); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += w; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[0] = vaddq_u32(sum[0], sum[2]); + + return horizontal_add_uint32x4(sum[0]); +} + +static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h, + second_pred); +} + +static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, + second_pred); +} + +#define HBD_SAD_WXH_AVG_NEON(w, h) \ + uint32_t vpx_highbd_sad##w##x##h##_avg_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ } -/* clang-format off */ -// 4x4 -highbd_sad4MxN(4, 4) -highbd_sad4MxN_avg(4, 4) -highbd_sadMxNx4D(4, 4) - -// 4x8 -highbd_sad4MxN(4, 8) -highbd_sad4MxN_avg(4, 8) -highbd_sadMxNx4D(4, 8) - -// 8x4 -highbd_sadMxN(8, 4) -highbd_sadMxN_avg(8, 4) -highbd_sadMxNx4D(8, 4) - -// 8x8 -highbd_sadMxN(8, 8) -highbd_sadMxN_avg(8, 8) -highbd_sadMxNx4D(8, 8) - -// 8x16 -highbd_sadMxN(8, 16) -highbd_sadMxN_avg(8, 16) -highbd_sadMxNx4D(8, 16) - -// 16x8 -highbd_sadMxN(16, 8) -highbd_sadMxN_avg(16, 8) -highbd_sadMxNx4D(16, 8) - -// 16x16 -highbd_sadMxN(16, 16) -highbd_sadMxN_avg(16, 16) -highbd_sadMxNx4D(16, 16) - -// 16x32 -highbd_sadMxN(16, 32) -highbd_sadMxN_avg(16, 32) -highbd_sadMxNx4D(16, 32) - -// 32x16 -highbd_sadMxN(32, 16) -highbd_sadMxN_avg(32, 16) -highbd_sadMxNx4D(32, 16) - -// 32x32 -highbd_sadMxN(32, 32) -highbd_sadMxN_avg(32, 32) -highbd_sadMxNx4D(32, 32) - -// 32x64 -highbd_sadMxN(32, 64) -highbd_sadMxN_avg(32, 64) -highbd_sadMxNx4D(32, 64) - -// 64x32 -highbd_sadMxN(64, 32) -highbd_sadMxN_avg(64, 32) -highbd_sadMxNx4D(64, 32) - -// 64x64 -highbd_sadMxN(64, 64) -highbd_sadMxN_avg(64, 64) 
-highbd_sadMxNx4D(64, 64) - /* clang-format on */ +HBD_SAD_WXH_AVG_NEON(4, 4) +HBD_SAD_WXH_AVG_NEON(4, 8) + +HBD_SAD_WXH_AVG_NEON(8, 4) +HBD_SAD_WXH_AVG_NEON(8, 8) +HBD_SAD_WXH_AVG_NEON(8, 16) + +HBD_SAD_WXH_AVG_NEON(16, 8) +HBD_SAD_WXH_AVG_NEON(16, 16) +HBD_SAD_WXH_AVG_NEON(16, 32) + +HBD_SAD_WXH_AVG_NEON(32, 16) +HBD_SAD_WXH_AVG_NEON(32, 32) +HBD_SAD_WXH_AVG_NEON(32, 64) + +HBD_SAD_WXH_AVG_NEON(64, 32) +HBD_SAD_WXH_AVG_NEON(64, 64) diff --git a/vpx_dsp/arm/highbd_sse_neon.c b/vpx_dsp/arm/highbd_sse_neon.c new file mode 100644 index 000000000..ee76bed58 --- /dev/null +++ b/vpx_dsp/arm/highbd_sse_neon.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src, + const uint16_t *ref, + uint32x4_t *sse_acc0, + uint32x4_t *sse_acc1) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); + uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); + + *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo); + *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi); +} + +static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, + uint32x4_t *sse_acc0, + uint32x4_t *sse_acc1) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); + uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); + + *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo); + *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi); +} + +static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[8]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x8(sse); +} + +static INLINE int64_t highbd_sse_32xh_neon(const uint16_t 
*src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[8]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x8(sse); +} + +static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[4]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x4(sse); +} + +static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2]; + highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x2(sse); +} + +static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + // Peel the first loop iteration. + uint16x4_t s = vld1_u16(src); + uint16x4_t r = vld1_u16(ref); + + uint16x4_t abs_diff = vabd_u16(s, r); + uint32x4_t sse = vmull_u16(abs_diff, abs_diff); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + s = vld1_u16(src); + r = vld1_u16(ref); + + abs_diff = vabd_u16(s, r); + sse = vmlal_u16(sse, abs_diff, abs_diff); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4(sse); +} + +static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int width, int height) { + // { 0, 1, 2, 3, 4, 5, 6, 7 } + uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100)); + uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7)); + uint64_t sse = 0; + + do { + int w = width; + int offset = 0; + + do { + uint16x8_t s = vld1q_u16(src + offset); + uint16x8_t r = vld1q_u16(ref + offset); + uint16x8_t abs_diff; + uint16x4_t abs_diff_lo; + uint16x4_t abs_diff_hi; + uint32x4_t sse_u32; + + if (w < 8) { + // Mask out-of-range elements. 
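// (k01234567 holds the lane indices 0, 1, ..., 7, so the vclt against
// width & 7 yields an all-ones lane exactly for the in-range positions;
// masked-off lanes become zero in both src and ref and therefore
// contribute nothing to the sum of squares.)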
+ s = vandq_u16(s, remainder_mask); + r = vandq_u16(r, remainder_mask); + } + + abs_diff = vabdq_u16(s, r); + abs_diff_lo = vget_low_u16(abs_diff); + abs_diff_hi = vget_high_u16(abs_diff); + + sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo); + sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi); + + sse += horizontal_long_add_uint32x4(sse_u32); + + offset += 8; + w -= 8; + } while (w > 0); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + return sse; +} + +int64_t vpx_highbd_sse_neon(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, int width, + int height) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + + switch (width) { + case 4: + return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height); + case 8: + return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height); + case 16: + return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height); + case 32: + return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height); + case 64: + return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height); + default: + return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width, + height); + } +} diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c new file mode 100644 index 000000000..683df5797 --- /dev/null +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -0,0 +1,586 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" + +// The bilinear filters look like this: +// +// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, +// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }} +// +// We can factor out the highest common multiple, such that the sum of both +// weights will be 8 instead of 128. The benefits of this are two-fold: +// +// 1) We can infer the filter values from the filter_offset parameter in the +// bilinear filter functions below - we don't have to actually load the values +// from memory: +// f0 = 8 - filter_offset +// f1 = filter_offset +// +// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on +// 16-bit data types at all times, rather than widening out to 32-bit and +// requiring double the number of data processing instructions. (12-bit * 8 = +// 15-bit.) + +// Process a block exactly 4 wide and any height. 
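For reference, the arithmetic shared by every bilinear kernel below is compact enough to state in scalar C. This is an illustrative sketch only -- the function name is invented here and is not part of the change -- assuming f0 + f1 == 8 as derived in the comment above:

#include <stdint.h>

// Scalar model of one bilinear output pixel (illustrative, not from the diff).
static uint16_t highbd_bilinear_blend_scalar(uint16_t s0, uint16_t s1,
                                             int filter_offset) {
  const uint32_t f0 = (uint32_t)(8 - filter_offset);
  const uint32_t f1 = (uint32_t)filter_offset;
  // vrshr_n_u16(x, 3) is a rounding shift, i.e. (x + 4) >> 3. With 12-bit
  // input the blend peaks at 4095 * 8 = 32760, so 16-bit lanes never
  // overflow.
  return (uint16_t)((s0 * f0 + s1 * f1 + 4) >> 3);
}

The NEON kernels that follow vectorize exactly this computation, four or eight pixels at a time.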
+static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); + const uint16x4_t f1 = vdup_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x4_t s0 = load_unaligned_u16(src_ptr); + uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step); + + uint16x4_t blend = vmul_u16(s0, f0); + blend = vmla_u16(blend, s1, f1); + blend = vrshr_n_u16(blend, 3); + + vst1_u16(dst_ptr, blend); + + src_ptr += src_stride; + dst_ptr += 4; + } while (--i != 0); +} + +// Process a block which is a multiple of 8 and any height. +static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int src_stride, int pixel_step, + int dst_width, int dst_height, + int filter_offset) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, blend); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 8, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 16, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 32, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 64, dst_height, filter_offset); +} + +static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) 
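// (This helper is reached only via the HBD_SPECIALIZED_* macros further
// down, which are instantiated solely for block widths of 16 and above --
// hence the assertion below.)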
+ assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + vst1q_u16(dst_ptr + j, avg); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ + w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \ + h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \ + src_stride, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + 1)); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + 1)); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ + xoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + 1), xoffset); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + 
uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + 1), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 8-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8) + +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64) + +// 10-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8) + +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64) + +// 12-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8) + +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64) + +// Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having +// width 4. +static void highbd_avg_pred_var_filter_block2d_bil_w4( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); + const uint16x4_t f1 = vdup_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x4_t s0 = load_unaligned_u16(src_ptr); + uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step); + uint16x4_t p = vld1_u16(second_pred); + + uint16x4_t blend = vmul_u16(s0, f0); + blend = vmla_u16(blend, s1, f1); + blend = vrshr_n_u16(blend, 3); + + vst1_u16(dst_ptr, vrhadd_u16(blend, p)); + + src_ptr += src_stride; + dst_ptr += 4; + second_pred += 4; + } while (--i != 0); +} + +// Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks. 
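// As in the width-4 version above, the second prediction is folded into the
// filter pass with a single rounding halving add: vrhaddq_u16(blend, p)
// computes (blend + p + 1) >> 1, the same rounding average that
// vpx_highbd_comp_avg_pred applies, so no separate averaging pass over the
// block is needed.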
+static void highbd_avg_pred_var_filter_block2d_bil_large( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint16_t *second_pred) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p)); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_avg_pred_var_filter_block2d_bil_w8( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 8, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w16( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 16, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w32( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 32, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w64( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 64, dst_height, + filter_offset, second_pred); +} + +// Combine averaging subpel filter with vpx_highbd_comp_avg_pred. +static void highbd_avg_pred_var_filter_block2d_avg( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + + uint16x8_t p = vld1q_u16(second_pred); + avg = vrhaddq_u16(avg, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16. +static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, + int src_stride, int dst_width, int dst_height, + const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) 
+ assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t avg = vrhaddq_u16(s, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + if (xoffset == 0) { \ + uint16_t tmp[w * h]; \ + if (yoffset == 0) { \ + highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp, source_stride, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp, source_stride, source_stride, h, yoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp0, source_stride, 1, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + 1)); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + 1)); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp0, source_stride, 1, 
h, xoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + 1), xoffset); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + 1), xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 8-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64) + +// 10-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64) + +// 12-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64) diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index 96a35af01..309ae7fd3 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -18,479 +18,419 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -static const uint8_t bilinear_filters[8][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; - -static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride, - 
const uint16_t *ref_ptr, int ref_stride, - int w, int h, uint64_t *sse, - int64_t *sum) { - int i, j; - - if (w >= 8) { - int32x4_t sum_s32 = vdupq_n_s32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const int16x8_t src_s16 = vreinterpretq_s16_u16(vld1q_u16(&src_ptr[j])); - const int16x8_t ref_s16 = vreinterpretq_s16_u16(vld1q_u16(&ref_ptr[j])); - const int32x4_t diff1_s32 = - vsubl_s16(vget_low_s16(src_s16), vget_low_s16(ref_s16)); - const int32x4_t diff2_s32 = - vsubl_s16(vget_high_s16(src_s16), vget_high_s16(ref_s16)); - const uint32x4_t diff1_u32 = vreinterpretq_u32_s32(diff1_s32); - const uint32x4_t diff2_u32 = vreinterpretq_u32_s32(diff2_s32); - sum_s32 = vaddq_s32(sum_s32, diff1_s32); - sum_s32 = vaddq_s32(sum_s32, diff2_s32); - sse_u32 = vmlaq_u32(sse_u32, diff1_u32, diff1_u32); - sse_u32 = vmlaq_u32(sse_u32, diff2_u32, diff2_u32); - } - src_ptr += src_stride; - ref_ptr += ref_stride; - } - *sum = horizontal_add_int32x4(sum_s32); - *sse = horizontal_add_uint32x4(sse_u32); - } else { - int32x4_t sum_s32 = vdupq_n_s32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - assert(w >= 4); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 4) { - const int16x4_t src_s16 = vreinterpret_s16_u16(vld1_u16(&src_ptr[j])); - const int16x4_t ref_s16 = vreinterpret_s16_u16(vld1_u16(&ref_ptr[j])); - const int32x4_t diff_s32 = vsubl_s16(src_s16, ref_s16); - const uint32x4_t diff_u32 = vreinterpretq_u32_s32(diff_s32); - sum_s32 = vaddq_s32(sum_s32, diff_s32); - sse_u32 = vmlaq_u32(sse_u32, diff_u32, diff_u32); - } - src_ptr += src_stride; - ref_ptr += ref_stride; - } - *sum = horizontal_add_int32x4(sum_s32); - *sse = horizontal_add_uint32x4(sse_u32); - } +// Process a block of width 4 two rows at a time. +static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_s32 = vdupq_n_s32(0); + + int i = h; + do { + const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride); + const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride); + + int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); + sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int16x8(sum_s16); + *sse = horizontal_add_int32x4(sse_s32); } -static INLINE void highbd_variance64(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint64_t *sse, - int64_t *sum) { - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); - uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); - - if (w < 32 && h < 32) { - highbd_variance16(src_ptr, src_stride, ref_ptr, ref_stride, w, h, sse, sum); - } else { - uint64_t sse_long = 0; - int64_t sum_long = 0; - int k, l; - for (k = 0; k + 16 <= h; k += 16) { - for (l = 0; l + 16 <= w; l += 16) { - uint64_t sse_tmp = 0; - int64_t sum_tmp = 0; - highbd_variance16(src_ptr + l, src_stride, ref_ptr + l, ref_stride, 16, - 16, &sse_tmp, &sum_tmp); - sum_long += sum_tmp; - sse_long += sse_tmp; - } - src_ptr += 16 * src_stride; - ref_ptr += 16 * ref_stride; - } - *sum = sum_long; - *sse = sse_long; - } +// For 8-bit and 10-bit data, since we're using two int32x4 accumulators, all +// block sizes can be processed in 
32-bit elements (1023*1023*64*16 = 1071645696 +// for a 64x64 block). +static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + int i = h; + do { + int j = 0; + do { + const uint16x8_t s = vld1q_u16(src_ptr + j); + const uint16x8_t r = vld1q_u16(ref_ptr + j); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s32 = vpadalq_s16(sum_s32, diff); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_long_add_uint32x4(vaddq_u32( + vreinterpretq_u32_s32(sse_s32[0]), vreinterpretq_u32_s32(sse_s32[1]))); } -static INLINE void highbd_8_variance(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint32_t *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, - &sum_long); - *sse = (uint32_t)sse_long; - *sum = (int)sum_long; +static INLINE void highbd_variance_8xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 8, h, sse, sum); } -static INLINE void highbd_10_variance(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint32_t *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, - &sum_long); - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +static INLINE void highbd_variance_16xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 16, h, sse, sum); } -static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint32_t *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, - &sum_long); - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +static INLINE void highbd_variance_32xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); } -#define HIGHBD_VAR(W, H) \ - uint32_t vpx_highbd_8_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ - } \ - \ - uint32_t vpx_highbd_10_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_10_variance(src_ptr, src_stride, ref_ptr, 
ref_stride, W, H, sse, \ - &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ - } \ - \ - uint32_t vpx_highbd_12_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ - } +static INLINE void highbd_variance_64xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} -#define HIGHBD_GET_VAR(S) \ - void vpx_highbd_8_get##S##x##S##var_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse, int *sum) { \ - highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ - sum); \ - } \ - \ - void vpx_highbd_10_get##S##x##S##var_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse, int *sum) { \ - highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ - sum); \ - } \ - \ - void vpx_highbd_12_get##S##x##S##var_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse, int *sum) { \ - highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ - sum); \ - } +// For 12-bit data, we can only accumulate up to 128 elements in the sum of +// squares (4095*4095*128 = 2146435200), and because we're using two int32x4 +// accumulators, we can only process up to 32 32-element rows (32*32/8 = 128) +// or 16 64-element rows before we have to accumulate into 64-bit elements. +// Therefore blocks of size 32x64, 64x32 and 64x64 are processed in a different +// helper function. -#define HIGHBD_MSE(W, H) \ - uint32_t vpx_highbd_8_mse##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_10_mse##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_12_mse##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse; \ - } +// Process a block of any size where the width is divisible by 8, with +// accumulation into 64-bit elements. 
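// For example (following the bound above; the callers themselves are not
// shown in this hunk): a 64-wide 12-bit block is processed with
// h_limit = 16, since 64 * 16 elements spread over the 8 lanes of the two
// int32x4 accumulators is 128 squared values per lane, and
// 4095 * 4095 * 128 = 2146435200 still fits below 2^31.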
+static INLINE void highbd_variance_xlarge_neon( + const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, + int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int64x2_t sse_s64 = vdupq_n_s64(0); -static INLINE void highbd_var_filter_block2d_bil_first_pass( - const uint8_t *src_ptr8, uint16_t *output_ptr, - unsigned int src_pixels_per_line, int pixel_step, - unsigned int output_height, unsigned int output_width, - const uint8_t *filter) { - uint32_t i, j; - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); - - uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); - uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); - uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); - - if (output_width >= 8) { - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 8) { - const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); - const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); - uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); - uint16x4_t out1_u16; - uint16x4_t out2_u16; - sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); - sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); - out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); - out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); - vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } else { - assert(output_width >= 4); - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 4) { - const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); - const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); - uint16x4_t out_u16; - sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); - out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); - vst1_u16(&output_ptr[j], out_u16); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } + // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit + // accumulator overflows. After hitting this limit we accumulate into 64-bit + // elements. + int h_tmp = h > h_limit ? 
h_limit : h; + + int i = 0; + do { + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + do { + int j = 0; + do { + const uint16x8_t s0 = vld1q_u16(src_ptr + j); + const uint16x8_t r0 = vld1q_u16(ref_ptr + j); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s0, r0)); + sum_s32 = vpadalq_s16(sum_s32, diff); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + i++; + } while (i < h_tmp); + + sse_s64 = vpadalq_s32(sse_s64, sse_s32[0]); + sse_s64 = vpadalq_s32(sse_s64, sse_s32[1]); + h_tmp += h_limit; + } while (i < h); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = (uint64_t)horizontal_add_int64x2(sse_s64); } -static INLINE void highbd_var_filter_block2d_bil_second_pass( - const uint16_t *src_ptr, uint16_t *output_ptr, - unsigned int src_pixels_per_line, unsigned int pixel_step, - unsigned int output_height, unsigned int output_width, - const uint8_t *filter) { - uint32_t i, j; - - uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); - uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); - uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); - - if (output_width >= 8) { - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 8) { - const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); - const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); - uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); - uint16x4_t out1_u16; - uint16x4_t out2_u16; - sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); - sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); - out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); - out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); - vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } else { - assert(output_width >= 4); - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 4) { - const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); - const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); - uint16x4_t out_u16; - sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); - out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); - vst1_u16(&output_ptr[j], out_u16); - } - // Next row... 
- src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } +static INLINE void highbd_variance_32xh_xlarge_neon( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, int64_t *sum) { + highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 32, h, 32, sse, + sum); +} + +static INLINE void highbd_variance_64xh_xlarge_neon( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, int64_t *sum) { + highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 64, h, 16, sse, + sum); } -#define HIGHBD_SUBPIX_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp2), W, \ - ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - return vpx_highbd_10_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - return vpx_highbd_12_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ +#define HBD_VARIANCE_WXH_8_NEON(w, h) \ + uint32_t vpx_highbd_8_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + sum = (int)sum_long; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \ } -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - 
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ - H, temp2, W); \ - \ - return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp3), W, \ - ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ - H, temp2, W); \ - \ - return vpx_highbd_10_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ - H, temp2, W); \ - \ - return vpx_highbd_12_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \ +#define HBD_VARIANCE_WXH_10_NEON(w, h) \ + uint32_t vpx_highbd_10_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ } -void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, - int width, int height, const uint16_t *ref, - int ref_stride) { - int i, j; - uint32x4_t one_u32 = vdupq_n_u32(1); - if (width >= 8) { - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 8) { - const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); - const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); - const uint32x4_t sum1_u32 = - vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); - const uint32x4_t sum2_u32 = - vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); - const uint16x4_t sum1_u16 = - vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); - const uint16x4_t sum2_u16 = - vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); - const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); - vst1q_u16(&comp_pred[j], vcomp_pred); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } - } else { - assert(width >= 4); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 4) { - const uint16x4_t pred_u16 = vld1_u16(&pred[j]); - const uint16x4_t ref_u16 = vld1_u16(&ref[j]); - const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); - const uint16x4_t vcomp_pred = - vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); - vst1_u16(&comp_pred[j], vcomp_pred); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } +#define HBD_VARIANCE_WXH_12_NEON(w, h) \ + uint32_t vpx_highbd_12_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } -} -/* All three forms of the variance are available in the same sizes. */ -#define HIGHBD_VARIANCES(W, H) \ - HIGHBD_VAR(W, H) \ - HIGHBD_SUBPIX_VAR(W, H) \ - HIGHBD_SUBPIX_AVG_VAR(W, H) - -HIGHBD_VARIANCES(64, 64) -HIGHBD_VARIANCES(64, 32) -HIGHBD_VARIANCES(32, 64) -HIGHBD_VARIANCES(32, 32) -HIGHBD_VARIANCES(32, 16) -HIGHBD_VARIANCES(16, 32) -HIGHBD_VARIANCES(16, 16) -HIGHBD_VARIANCES(16, 8) -HIGHBD_VARIANCES(8, 16) -HIGHBD_VARIANCES(8, 8) -HIGHBD_VARIANCES(8, 4) -HIGHBD_VARIANCES(4, 8) -HIGHBD_VARIANCES(4, 4) +#define HBD_VARIANCE_WXH_12_XLARGE_NEON(w, h) \ + uint32_t vpx_highbd_12_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_xlarge_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +// 8-bit +HBD_VARIANCE_WXH_8_NEON(4, 4) +HBD_VARIANCE_WXH_8_NEON(4, 8) + +HBD_VARIANCE_WXH_8_NEON(8, 4) +HBD_VARIANCE_WXH_8_NEON(8, 8) +HBD_VARIANCE_WXH_8_NEON(8, 16) + +HBD_VARIANCE_WXH_8_NEON(16, 8) +HBD_VARIANCE_WXH_8_NEON(16, 16) +HBD_VARIANCE_WXH_8_NEON(16, 32) + +HBD_VARIANCE_WXH_8_NEON(32, 16) +HBD_VARIANCE_WXH_8_NEON(32, 32) +HBD_VARIANCE_WXH_8_NEON(32, 64) + +HBD_VARIANCE_WXH_8_NEON(64, 32) +HBD_VARIANCE_WXH_8_NEON(64, 64) + +// 10-bit +HBD_VARIANCE_WXH_10_NEON(4, 4) +HBD_VARIANCE_WXH_10_NEON(4, 8) + +HBD_VARIANCE_WXH_10_NEON(8, 4) +HBD_VARIANCE_WXH_10_NEON(8, 8) +HBD_VARIANCE_WXH_10_NEON(8, 16) + +HBD_VARIANCE_WXH_10_NEON(16, 8) +HBD_VARIANCE_WXH_10_NEON(16, 16) +HBD_VARIANCE_WXH_10_NEON(16, 32) + +HBD_VARIANCE_WXH_10_NEON(32, 16) +HBD_VARIANCE_WXH_10_NEON(32, 32) +HBD_VARIANCE_WXH_10_NEON(32, 64) + +HBD_VARIANCE_WXH_10_NEON(64, 32) +HBD_VARIANCE_WXH_10_NEON(64, 64) + +// 12-bit +HBD_VARIANCE_WXH_12_NEON(4, 4) +HBD_VARIANCE_WXH_12_NEON(4, 8) + +HBD_VARIANCE_WXH_12_NEON(8, 4) +HBD_VARIANCE_WXH_12_NEON(8, 8) +HBD_VARIANCE_WXH_12_NEON(8, 16) + +HBD_VARIANCE_WXH_12_NEON(16, 8) +HBD_VARIANCE_WXH_12_NEON(16, 16) +HBD_VARIANCE_WXH_12_NEON(16, 32) + +HBD_VARIANCE_WXH_12_NEON(32, 16) +HBD_VARIANCE_WXH_12_NEON(32, 32) +HBD_VARIANCE_WXH_12_XLARGE_NEON(32, 64) + +HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 32) +HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64) + +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + *sum = (int)sum_long; \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + } HIGHBD_GET_VAR(8) HIGHBD_GET_VAR(16) -HIGHBD_MSE(16, 16) -HIGHBD_MSE(16, 8) -HIGHBD_MSE(8, 16) -HIGHBD_MSE(8, 8) +static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t r = vld1q_u16(ref_ptr + j); + + uint16x8_t diff = vabdq_u16(s, r); + + sse_u32[0] = + vmlal_u16(sse_u32[0], vget_low_u16(diff), vget_low_u16(diff)); + sse_u32[1] = + vmlal_u16(sse_u32[1], 
vget_high_u16(diff), vget_high_u16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); +} + +static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h); +} + +static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h); +} + +#define HIGHBD_MSE_WXH_NEON(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_NEON(16, 16) +HIGHBD_MSE_WXH_NEON(16, 8) +HIGHBD_MSE_WXH_NEON(8, 16) +HIGHBD_MSE_WXH_NEON(8, 8) + +#undef HIGHBD_MSE_WXH_NEON diff --git a/vpx_dsp/arm/highbd_variance_neon_dotprod.c b/vpx_dsp/arm/highbd_variance_neon_dotprod.c new file mode 100644 index 000000000..1a8872017 --- /dev/null +++ b/vpx_dsp/arm/highbd_variance_neon_dotprod.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h / 2; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u16(src_ptr); + src_ptr += src_stride; + r0 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, diff, diff); + } while (--i != 0); + + return horizontal_add_uint32x4(sse_u32); +} + +static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + s1 = vld1q_u16(src_ptr + 8); + r0 = vld1q_u16(ref_ptr); + r1 = vld1q_u16(ref_ptr + 8); + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, diff, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(sse_u32); +} + +#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = \ + highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16) +HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8) +HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16) +HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8) + +#undef HIGHBD_MSE_WXH_NEON_DOTPROD diff --git a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c index c46c01631..47684473c 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -355,7 +355,6 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, } else { const int16x8_t filters = vld1q_s16(filter[x0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - uint16x8_t t0, t1, t2, t3; assert(!((intptr_t)dst & 3)); assert(!(dst_stride & 3)); @@ -365,6 +364,7 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, if (h == 4) { int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; int32x4_t d0, d1, d2, d3; + uint16x8_t t0, t1, t2, t3; uint16x8_t d01, d23, t01, t23; __builtin_prefetch(src + 0 * src_stride); diff --git a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c index 9d2752e09..775108208 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -26,76 +26,88 @@ void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, (void)bd; if (w < 8) { // copy4 + uint16x4_t s0, s1; do { - vst1_u16(dst, vld1_u16(src)); + s0 = vld1_u16(src); src += src_stride; - dst += dst_stride; - vst1_u16(dst, vld1_u16(src)); + s1 = vld1_u16(src); src += 
src_stride; + + vst1_u16(dst, s0); + dst += dst_stride; + vst1_u16(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 8) { // copy8 + uint16x8_t s0, s1; do { - vst1q_u16(dst, vld1q_u16(src)); + s0 = vld1q_u16(src); src += src_stride; - dst += dst_stride; - vst1q_u16(dst, vld1q_u16(src)); + s1 = vld1q_u16(src); src += src_stride; + + vst1q_u16(dst, s0); + dst += dst_stride; + vst1q_u16(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w < 32) { // copy16 + uint16x8_t s0, s1, s2, s3; do { - vst2q_u16(dst, vld2q_u16(src)); - src += src_stride; - dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); src += src_stride; - dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); + s2 = vld1q_u16(src); + s3 = vld1q_u16(src + 8); src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); - src += src_stride; + vst1q_u16(dst, s2); + vst1q_u16(dst + 8, s3); dst += dst_stride; - h -= 4; - } while (h > 0); + h -= 2; + } while (h != 0); } else if (w == 32) { // copy32 + uint16x8_t s0, s1, s2, s3; do { - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + vst1q_u16(dst + 16, s2); + vst1q_u16(dst + 24, s3); dst += dst_stride; - h -= 4; - } while (h > 0); + } while (--h != 0); } else { // copy64 + uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7; do { - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + s4 = vld1q_u16(src + 32); + s5 = vld1q_u16(src + 40); + s6 = vld1q_u16(src + 48); + s7 = vld1q_u16(src + 56); src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - h -= 4; - } while (h > 0); + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + vst1q_u16(dst + 16, s2); + vst1q_u16(dst + 24, s3); + vst1q_u16(dst + 32, s4); + vst1q_u16(dst + 40, s5); + vst1q_u16(dst + 48, s6); + vst1q_u16(dst + 56, s7); + dst += dst_stride; + } while (--h != 0); } } diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 38e275834..4f909e493 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -12,51 +12,47 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "mem_neon.h" +#include "sum_neon.h" #include "vpx/vpx_integer.h" //------------------------------------------------------------------------------ // DC 4x4 -static INLINE uint16x4_t dc_sum_4(const uint8_t *ref) { - const uint8x8_t ref_u8 = vld1_u8(ref); - const uint16x4_t p0 = vpaddl_u8(ref_u8); - return vpadd_u16(p0, p0); +static INLINE uint16_t dc_sum_4(const uint8_t *ref) { + return horizontal_add_uint8x4(load_unaligned_u8_4x1(ref)); } static INLINE void dc_store_4x4(uint8_t *dst, 
ptrdiff_t stride, const uint8x8_t dc) { - const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); int i; for (i = 0; i < 4; ++i, dst += stride) { - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc_dup), 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc), 0); } } void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t a = vld1_u8(above); - const uint8x8_t l = vld1_u8(left); - const uint16x8_t al = vaddl_u8(a, l); - uint16x4_t sum; - uint8x8_t dc; - sum = vpadd_u16(vget_low_u16(al), vget_low_u16(al)); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint8x8_t a = load_unaligned_u8_4x1(above); + const uint8x8_t l = load_unaligned_u8_4x1(left); + const uint16x4_t al = vget_low_u16(vaddl_u8(a, l)); + const uint16_t sum = horizontal_add_uint16x4(al); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); dc_store_4x4(dst, stride, dc); } void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_4(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); + const uint16_t sum = dc_sum_4(left); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2); (void)above; dc_store_4x4(dst, stride, dc); } void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_4(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); + const uint16_t sum = dc_sum_4(above); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2); (void)left; dc_store_4x4(dst, stride, dc); } @@ -72,19 +68,15 @@ void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 8x8 -static INLINE uint16x4_t dc_sum_8(const uint8_t *ref) { - const uint8x8_t ref_u8 = vld1_u8(ref); - uint16x4_t sum = vpaddl_u8(ref_u8); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_8(const uint8_t *ref) { + return horizontal_add_uint8x8(vld1_u8(ref)); } static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc) { - const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); int i; for (i = 0; i < 8; ++i, dst += stride) { - vst1_u8(dst, dc_dup); + vst1_u8(dst, dc); } } @@ -92,28 +84,24 @@ void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t above_u8 = vld1_u8(above); const uint8x8_t left_u8 = vld1_u8(left); - const uint8x16_t above_and_left = vcombine_u8(above_u8, left_u8); - const uint16x8_t p0 = vpaddlq_u8(above_and_left); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16x8_t al = vaddl_u8(above_u8, left_u8); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 4); dc_store_8x8(dst, stride, dc); } void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_8(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint16_t sum = dc_sum_8(left); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); (void)above; dc_store_8x8(dst, stride, dc); } void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t 
*above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_8(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint16_t sum = dc_sum_8(above); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); (void)left; dc_store_8x8(dst, stride, dc); } @@ -129,20 +117,15 @@ void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 16x16 -static INLINE uint16x4_t dc_sum_16(const uint8_t *ref) { - const uint8x16_t ref_u8 = vld1q_u8(ref); - const uint16x8_t p0 = vpaddlq_u8(ref_u8); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_16(const uint8_t *ref) { + return horizontal_add_uint8x16(vld1q_u8(ref)); } static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride, - const uint8x8_t dc) { - const uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); + const uint8x16_t dc) { int i; for (i = 0; i < 16; ++i, dst += stride) { - vst1q_u8(dst, dc_dup); + vst1q_u8(dst + 0, dc); } } @@ -150,22 +133,19 @@ void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t ref0 = vld1q_u8(above); const uint8x16_t ref1 = vld1q_u8(left); - const uint16x8_t p0 = vpaddlq_u8(ref0); - const uint16x8_t p1 = vpaddlq_u8(ref1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16x8_t a = vpaddlq_u8(ref0); + const uint16x8_t l = vpaddlq_u8(ref1); + const uint16x8_t al = vaddq_u16(a, l); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); dc_store_16x16(dst, stride, dc); } void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_16(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16_t sum = dc_sum_16(left); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0); (void)above; dc_store_16x16(dst, stride, dc); } @@ -173,8 +153,8 @@ void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_16(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16_t sum = dc_sum_16(above); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0); (void)left; dc_store_16x16(dst, stride, dc); } @@ -182,7 +162,7 @@ void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t dc = vdup_n_u8(0x80); + const uint8x16_t dc = vdupq_n_u8(0x80); (void)above; (void)left; dc_store_16x16(dst, stride, dc); @@ -191,51 +171,41 @@ void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 32x32 -static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { - const uint8x16x2_t r = vld2q_u8(ref); - const uint16x8_t p0 = vpaddlq_u8(r.val[0]); - const uint16x8_t p1 = vpaddlq_u8(r.val[1]); - const uint16x8_t p2 = vaddq_u16(p0, p1); - 
uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_32(const uint8_t *ref) { + const uint8x16_t r0 = vld1q_u8(ref + 0); + const uint8x16_t r1 = vld1q_u8(ref + 16); + const uint16x8_t r01 = vaddq_u16(vpaddlq_u8(r0), vpaddlq_u8(r1)); + return horizontal_add_uint16x8(r01); } static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride, - const uint8x8_t dc) { - uint8x16x2_t dc_dup; + const uint8x16_t dc) { int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0); - for (i = 0; i < 32; ++i, dst += stride) { - vst2q_u8(dst, dc_dup); + vst1q_u8(dst + 0, dc); + vst1q_u8(dst + 16, dc); } } void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16x2_t a = vld2q_u8(above); - const uint8x16x2_t l = vld2q_u8(left); - const uint16x8_t pa0 = vpaddlq_u8(a.val[0]); - const uint16x8_t pl0 = vpaddlq_u8(l.val[0]); - const uint16x8_t pa1 = vpaddlq_u8(a.val[1]); - const uint16x8_t pl1 = vpaddlq_u8(l.val[1]); - const uint16x8_t pa = vaddq_u16(pa0, pa1); - const uint16x8_t pl = vaddq_u16(pl0, pl1); - const uint16x8_t pal = vaddq_u16(pa, pl); - uint16x4_t sum = vadd_u16(vget_low_u16(pal), vget_high_u16(pal)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 6)); + const uint8x16_t a0 = vld1q_u8(above + 0); + const uint8x16_t a1 = vld1q_u8(above + 16); + const uint8x16_t l0 = vld1q_u8(left + 0); + const uint8x16_t l1 = vld1q_u8(left + 16); + const uint16x8_t a01 = vaddq_u16(vpaddlq_u8(a0), vpaddlq_u8(a1)); + const uint16x8_t l01 = vaddq_u16(vpaddlq_u8(l0), vpaddlq_u8(l1)); + const uint16x8_t al = vaddq_u16(a01, l01); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 6), 0); dc_store_32x32(dst, stride, dc); } void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_32(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16_t sum = dc_sum_32(left); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); (void)above; dc_store_32x32(dst, stride, dc); } @@ -243,8 +213,8 @@ void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_32(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16_t sum = dc_sum_32(above); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); (void)left; dc_store_32x32(dst, stride, dc); } @@ -252,7 +222,7 @@ void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t dc = vdup_n_u8(0x80); + const uint8x16_t dc = vdupq_n_u8(0x80); (void)above; (void)left; dc_store_32x32(dst, stride, dc); @@ -262,123 +232,629 @@ void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t ABCDEFGH = vld1_u8(above); - const uint64x1_t A1 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 8); - const uint64x1_t A2 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 16); - const uint8x8_t BCDEFGH0 = 
vreinterpret_u8_u64(A1); - const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); - const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); - const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r0 = vreinterpret_u32_u8(avg2); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + uint8x8_t a0, a1, a2, d0; + uint8_t a7; (void)left; - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); - vst1_lane_u8(dst + 3 * stride + 3, ABCDEFGH, 7); -} -static INLINE void d45_store_8(uint8_t **dst, const ptrdiff_t stride, - const uint8x8_t above_right, uint8x8_t *row) { - *row = vext_u8(*row, above_right, 1); - vst1_u8(*dst, *row); - *dst += stride; + a0 = vld1_u8(above); + a7 = above[7]; + + // [ above[1], ..., above[6], x, x ] + a1 = vext_u8(a0, a0, 1); + // [ above[2], ..., above[7], x, x ] + a2 = vext_u8(a0, a0, 2); + + // d0[0] = AVG3(above[0], above[1], above[2]); + // ... + // d0[5] = AVG3(above[5], above[6], above[7]); + // d0[6] = x (don't care) + // d0[7] = x (don't care) + d0 = vrhadd_u8(vhadd_u8(a0, a2), a1); + + // We want: + // stride=0 [ d0[0], d0[1], d0[2], d0[3] ] + // stride=1 [ d0[1], d0[2], d0[3], d0[4] ] + // stride=2 [ d0[2], d0[3], d0[4], d0[5] ] + // stride=3 [ d0[3], d0[4], d0[5], above[7] ] + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, vext_u8(d0, d0, 1)); + store_u8_4x1(dst + 2 * stride, vext_u8(d0, d0, 2)); + store_u8_4x1(dst + 3 * stride, vext_u8(d0, d0, 3)); + + // We stored the don't-care lane d0[6] above, so fix it up to above[7]. + dst[3 * stride + 3] = a7; } void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t A0 = vld1_u8(above); - const uint8x8_t above_right = vdup_lane_u8(A0, 7); - const uint8x8_t A1 = vext_u8(A0, above_right, 1); - const uint8x8_t A2 = vext_u8(A0, above_right, 2); - const uint8x8_t avg1 = vhadd_u8(A0, A2); - uint8x8_t row = vrhadd_u8(avg1, A1); + uint8x8_t ax0, a0, a1, a7, d0; (void)left; - vst1_u8(dst, row); - dst += stride; - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - vst1_u8(dst, above_right); -} - -static INLINE void d45_store_16(uint8_t **dst, const ptrdiff_t stride, - const uint8x16_t above_right, uint8x16_t *row) { - *row = vextq_u8(*row, above_right, 1); - vst1q_u8(*dst, *row); - *dst += stride; + a0 = vld1_u8(above + 0); + a1 = vld1_u8(above + 1); + a7 = vld1_dup_u8(above + 7); + + // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can + // shift in above[7] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[6] ] + ax0 = vext_u8(a0, a0, 7); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[7] = AVG3(above[6], above[7], above[8]); + d0 = vrhadd_u8(vhadd_u8(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[7]. 
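// (Editor's aside before the stores below, not part of the patch: the
// vrhadd_u8(vhadd_u8(a, c), b) pattern used above computes the scalar AVG3
// exactly, assuming the usual libvpx definition
// AVG3(a, b, c) = (a + 2 * b + c + 2) >> 2. Writing a + c = 2q + r with
// r in {0, 1}, we get a + 2 * b + c + 2 = 2 * (q + b + 1) + r, so both sides
// reduce to (q + b + 1) >> 1 and the truncating inner halving loses nothing.
// A brute-force cross-check, with hypothetical names:)
#include <assert.h>

#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)

static void check_avg3_halving_identity(void) {
  int a, b, c;
  for (a = 0; a < 256; ++a) {
    for (b = 0; b < 256; ++b) {
      for (c = 0; c < 256; ++c) {
        // vhadd_u8 truncates and vrhadd_u8 rounds; their composition is exact.
        const int via_halving = ((((a + c) >> 1) + b) + 1) >> 1;
        assert(via_halving == AVG3(a, b, c));
      }
    }
  }
}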
+ vst1_u8(dst + 0 * stride, vext_u8(d0, a7, 1)); + vst1_u8(dst + 1 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 3 * stride, vext_u8(d0, a7, 4)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 5)); + vst1_u8(dst + 5 * stride, vext_u8(d0, a7, 6)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 7)); + vst1_u8(dst + 7 * stride, a7); } void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0 = vld1q_u8(above); - const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0), 7); - const uint8x16_t A1 = vextq_u8(A0, above_right, 1); - const uint8x16_t A2 = vextq_u8(A0, above_right, 2); - const uint8x16_t avg1 = vhaddq_u8(A0, A2); - uint8x16_t row = vrhaddq_u8(avg1, A1); + uint8x16_t ax0, a0, a1, a15, d0; (void)left; - vst1q_u8(dst, row); - dst += stride; - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - vst1q_u8(dst, above_right); + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a15 = vld1q_dup_u8(above + 15); + + // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can + // shift in above[15] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[14] ] + ax0 = vextq_u8(a0, a0, 15); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[15] = AVG3(above[14], above[15], above[16]); + d0 = vrhaddq_u8(vhaddq_u8(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[15]. 
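// (Editor's aside, not part of the patch: each store below writes
// vextq_u8(d0, a15, r + 1) to row r, i.e. the top 15 - r lanes of d0 followed
// by r + 1 copies of above[15]. A scalar model of one output row, with
// hypothetical names:)
#include <stdint.h>

static void d45_16x16_row_model(uint8_t *row, const uint8_t *d0,
                                uint8_t above_right, int r) {
  int c;
  for (c = 0; c < 16; ++c) {
    const int lane = r + 1 + c;  // lane index after the ext shift
    row[c] = (lane < 16) ? d0[lane] : above_right;
  }
}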
+ vst1q_u8(dst + 0 * stride, vextq_u8(d0, a15, 1)); + vst1q_u8(dst + 1 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 3 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 5 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 7 * stride, vextq_u8(d0, a15, 8)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 9)); + vst1q_u8(dst + 9 * stride, vextq_u8(d0, a15, 10)); + vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(d0, a15, 12)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 13)); + vst1q_u8(dst + 13 * stride, vextq_u8(d0, a15, 14)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 15)); + vst1q_u8(dst + 15 * stride, a15); } void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0_0 = vld1q_u8(above); - const uint8x16_t A0_1 = vld1q_u8(above + 16); - const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0_1), 7); - const uint8x16_t A1_0 = vld1q_u8(above + 1); - const uint8x16_t A1_1 = vld1q_u8(above + 17); - const uint8x16_t A2_0 = vld1q_u8(above + 2); - const uint8x16_t A2_1 = vld1q_u8(above + 18); - const uint8x16_t avg_0 = vhaddq_u8(A0_0, A2_0); - const uint8x16_t avg_1 = vhaddq_u8(A0_1, A2_1); - uint8x16_t row_0 = vrhaddq_u8(avg_0, A1_0); - uint8x16_t row_1 = vrhaddq_u8(avg_1, A1_1); - int i; + uint8x16_t ax0, a0, a1, a15, a16, a17, a31, d0[2]; (void)left; - vst1q_u8(dst, row_0); - dst += 16; - vst1q_u8(dst, row_1); - dst += stride - 16; + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + a17 = vld1q_u8(above + 17); + a31 = vld1q_dup_u8(above + 31); + + // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can + // shift in the first lanes of d0[1] later, so shift a0 across by one to get + // the right inputs: + // [ x, above[0], ... , above[14] ] + ax0 = vextq_u8(a0, a0, 15); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[15] = AVG3(above[14], above[15], above[16]); + d0[0] = vrhaddq_u8(vhaddq_u8(ax0, a1), a0); + d0[1] = vrhaddq_u8(vhaddq_u8(a15, a17), a16); + + // Undo the earlier ext, incrementally shift in lanes of d0[1] and then + // duplicates of above[31]. 
+ vst1q_u8(dst + 0 * stride + 0, vextq_u8(d0[0], d0[1], 1)); + vst1q_u8(dst + 0 * stride + 16, vextq_u8(d0[1], a31, 1)); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(d0[0], d0[1], 2)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(d0[1], a31, 2)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0[0], d0[1], 3)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0[1], a31, 3)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d0[0], d0[1], 4)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d0[1], a31, 4)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0[0], d0[1], 5)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0[1], a31, 5)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d0[0], d0[1], 6)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d0[1], a31, 6)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0[0], d0[1], 7)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0[1], a31, 7)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d0[0], d0[1], 8)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d0[1], a31, 8)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0[0], d0[1], 9)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0[1], a31, 9)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d0[0], d0[1], 10)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d0[1], a31, 10)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0[0], d0[1], 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0[1], a31, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d0[0], d0[1], 12)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d0[1], a31, 12)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0[0], d0[1], 13)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0[1], a31, 13)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d0[0], d0[1], 14)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d0[1], a31, 14)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0[0], d0[1], 15)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0[1], a31, 15)); + vst1q_u8(dst + 15 * stride + 0, d0[1]); + vst1q_u8(dst + 15 * stride + 16, a31); + + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0[1], a31, 1)); + vst1q_u8(dst + 16 * stride + 16, a31); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d0[1], a31, 2)); + vst1q_u8(dst + 17 * stride + 16, a31); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0[1], a31, 3)); + vst1q_u8(dst + 18 * stride + 16, a31); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d0[1], a31, 4)); + vst1q_u8(dst + 19 * stride + 16, a31); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0[1], a31, 5)); + vst1q_u8(dst + 20 * stride + 16, a31); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d0[1], a31, 6)); + vst1q_u8(dst + 21 * stride + 16, a31); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0[1], a31, 7)); + vst1q_u8(dst + 22 * stride + 16, a31); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d0[1], a31, 8)); + vst1q_u8(dst + 23 * stride + 16, a31); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0[1], a31, 9)); + vst1q_u8(dst + 24 * stride + 16, a31); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d0[1], a31, 10)); + vst1q_u8(dst + 25 * stride + 16, a31); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0[1], a31, 11)); + vst1q_u8(dst + 26 * stride + 16, a31); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d0[1], a31, 12)); + vst1q_u8(dst + 27 * stride + 16, a31); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0[1], a31, 13)); + vst1q_u8(dst + 28 * stride + 16, a31); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d0[1], a31, 14)); + vst1q_u8(dst + 29 * stride + 16, a31); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0[1], a31, 15)); + vst1q_u8(dst + 30 * stride + 16, a31); + vst1q_u8(dst + 31 * stride + 0, a31); + vst1q_u8(dst + 31 * stride + 16, a31); +} - for (i = 0; i < 30; ++i) 
{ - row_0 = vextq_u8(row_0, row_1, 1); - row_1 = vextq_u8(row_1, above_right, 1); - vst1q_u8(dst, row_0); - dst += 16; - vst1q_u8(dst, row_1); - dst += stride - 16; - } +// ----------------------------------------------------------------------------- + +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a0, a1, a2, a3, d0, d1, d2, d3; + (void)left; + + a0 = load_unaligned_u8_4x1(above + 0); + a1 = load_unaligned_u8_4x1(above + 1); + a2 = load_unaligned_u8_4x1(above + 2); + a3 = load_unaligned_u8_4x1(above + 3); + + d0 = vrhadd_u8(a0, a1); + d1 = vrhadd_u8(vhadd_u8(a0, a2), a1); + d2 = vrhadd_u8(a1, a2); + d3 = vrhadd_u8(vhadd_u8(a1, a3), a2); + + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1(dst + 2 * stride, d2); + store_u8_4x1(dst + 3 * stride, d3); +} + +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a0, a1, a2, a7, d0, d1; + (void)left; + + a0 = vld1_u8(above + 0); + a1 = vld1_u8(above + 1); + a2 = vld1_u8(above + 2); + a7 = vld1_dup_u8(above + 7); + + d0 = vrhadd_u8(a0, a1); + d1 = vrhadd_u8(vhadd_u8(a0, a2), a1); + + vst1_u8(dst + 0 * stride, d0); + vst1_u8(dst + 1 * stride, d1); + + d0 = vext_u8(d0, d0, 7); + d1 = vext_u8(d1, d1, 7); - vst1q_u8(dst, above_right); - dst += 16; - vst1q_u8(dst, row_1); + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 2)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 3)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 4)); + vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 4)); +} + +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t a0, a1, a2, a15, d0, d1; + (void)left; + + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a2 = vld1q_u8(above + 2); + a15 = vld1q_dup_u8(above + 15); + + d0 = vrhaddq_u8(a0, a1); + d1 = vrhaddq_u8(vhaddq_u8(a0, a2), a1); + + vst1q_u8(dst + 0 * stride, d0); + vst1q_u8(dst + 1 * stride, d1); + + d0 = vextq_u8(d0, d0, 15); + d1 = vextq_u8(d1, d1, 15); + + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 2)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 3)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 4)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 5)); + vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 6)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 7)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 8)); + vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 8)); +} + +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t a0, a1, a2, a16, a17, a18, a31, d0_lo, d0_hi, d1_lo, d1_hi; + (void)left; + + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a2 = vld1q_u8(above + 2); + a16 = vld1q_u8(above + 16); + a17 = vld1q_u8(above + 17); + a18 = vld1q_u8(above + 18); + a31 = vld1q_dup_u8(above + 31); + + d0_lo = vrhaddq_u8(a0, a1); + d0_hi = vrhaddq_u8(a16, a17); + d1_lo = vrhaddq_u8(vhaddq_u8(a0, a2), a1); + d1_hi = vrhaddq_u8(vhaddq_u8(a16, a18), a17); + + vst1q_u8(dst + 0 * stride + 0, d0_lo); 
+ vst1q_u8(dst + 0 * stride + 16, d0_hi); + vst1q_u8(dst + 1 * stride + 0, d1_lo); + vst1q_u8(dst + 1 * stride + 16, d1_hi); + + d0_hi = vextq_u8(d0_lo, d0_hi, 15); + d0_lo = vextq_u8(d0_lo, d0_lo, 15); + d1_hi = vextq_u8(d1_lo, d1_hi, 15); + d1_lo = vextq_u8(d1_lo, d1_lo, 15); + + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 2)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 2)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 3)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 3)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 4)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 4)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 5)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 5)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 6)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 6)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 7)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 7)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 8)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 8)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 9)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 9)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 10)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 10)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 11)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 11)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 12)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 12)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 13)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 13)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 14)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 14)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 15)); + 
vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 15)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 15)); + vst1q_u8(dst + 30 * stride + 0, d0_hi); + vst1q_u8(dst + 30 * stride + 16, a31); + vst1q_u8(dst + 31 * stride + 0, d1_hi); + vst1q_u8(dst + 31 * stride + 16, a31); +} + +// ----------------------------------------------------------------------------- + +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1; + + az = load_unaligned_u8_4x1(above - 1); + a0 = load_unaligned_u8_4x1(above + 0); + // [ left[0], above[-1], above[0], above[1], x, x, x, x ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2); + col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2); + + d0 = vrhadd_u8(az, a0); + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + d2 = vext_u8(col0, d0, 7); + d3 = vext_u8(col1, d1, 7); + + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1(dst + 2 * stride, d2); + store_u8_4x1(dst + 3 * stride, d3); +} + +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1_u8(above - 1); + a0 = vld1_u8(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = vld1_u8(left + 0); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vext_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], above[0]) + // d0[1] = AVG2(above[0], above[1]) + // ... + // d0[7] = AVG2(above[6], above[7]) + d0 = vrhadd_u8(az, a0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vector to put the elements to be shifted in + // at the end. The lowest two lanes here are unused: + // col0[7] = AVG3(above[-1], left[0], left[1]) + // col0[6] = AVG3(left[0], left[1], left[2]) + // ... 
+ // col0[2] = AVG3(left[4], left[5], left[6]) + // col0[1] = x (don't care) + // col0[0] = x (don't care) + col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0)); + + // We don't care about the first parameter to this uzp since we only ever use + // the high three elements, we just use col0 again since it is already + // available: + // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] + // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] + col0_even = vuzp_u8(col0, col0).val[1]; + col0_odd = vuzp_u8(col0, col0).val[0]; + + // Incrementally shift more elements from col0 into d0/1: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] + // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] + // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] + // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] + // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] + vst1_u8(dst + 0 * stride, d0); + vst1_u8(dst + 1 * stride, d1); + vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7)); + vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7)); + vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6)); + vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6)); + vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5)); + vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5)); +} + +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[15], x ] + l1 = vextq_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0 = vrhaddq_u8(az, a0); + d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + + col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0 = vrev64q_u8(vextq_u8(col0, col0, 8)); + + // The low nine lanes here are unused so the first input to the uzp is + // unused, so just use a duplicate of col0 since we have it already. This + // also means that the lowest lane of col0 here is unused. 
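  // vuzpq_u8 de-interleaves: with the same vector passed twice, .val[0]
  // repeats the even-indexed lanes across both halves and .val[1] the
  // odd-indexed lanes. After the reversal above these are exactly the
  // alternating column values shifted in ahead of d0 and d1 below.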
+ col0_even = vuzpq_u8(col0, col0).val[1]; + col0_odd = vuzpq_u8(col0, col0).val[0]; + + vst1q_u8(dst + 0 * stride, d0); + vst1q_u8(dst + 1 * stride, d1); + vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15)); + vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15)); + vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14)); + vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14)); + vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13)); + vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13)); + vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12)); + vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12)); + vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11)); + vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10)); + vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10)); + vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9)); + vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9)); +} + +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1, + l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + a14 = vld1q_u8(above + 14); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + l1 = vld1q_u8(left + 1); + l15 = vld1q_u8(left + 15); + l16 = vld1q_u8(left + 16); + // The last lane here is unused, reading left[32] would cause a buffer + // over-read (observed as an address-sanitizer failure), so just fill with a + // duplicate of left[16] to avoid needing to materialize a zero: + // [ left[17], ... , left[31], x ] + l17 = vextq_u8(l16, l16, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0_lo = vrhaddq_u8(az, a0); + d0_hi = vrhaddq_u8(a15, a16); + d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); + + // The last lane of col0_hi is unused here. + col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); + + col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8)); + col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8)); + + // The first lane of these are unused since they are only ever called as + // ext(col0, _, i) where i >= 1. 
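  // Operand order matters here: the reversals above leave the near end of
  // the column (small left[] indices) in the high lanes, so col0_lo must
  // supply the high half of each uzp result and col0_hi the low half. The
  // ext shifts below then consume near-column values first and only reach
  // the deeper values on later rows.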
+ col0_even = vuzpq_u8(col0_hi, col0_lo).val[1]; + col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0]; + + vst1q_u8(dst + 0 * stride + 0, d0_lo); + vst1q_u8(dst + 0 * stride + 16, d0_hi); + vst1q_u8(dst + 1 * stride + 0, d1_lo); + vst1q_u8(dst + 1 * stride + 16, d1_hi); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3)); + 
vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1)); + vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1)); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1)); + vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1)); } // ----------------------------------------------------------------------------- @@ -390,22 +866,14 @@ void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8x8_t L3210 = vrev64_u8(L0123); const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4); const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5); - const uint8x8_t L10XA0123_ = - vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(L210XA0123), 8)); + const uint8x8_t L10XA0123_ = vext_u8(L210XA0123, L210XA0123, 1); const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012); const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r3 = vreinterpret_u32_u8(avg2); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - vst1_lane_u32((uint32_t *)dst, r0, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r1, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r2, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r3, 0); + + store_u8_4x1(dst + 0 * stride, vext_u8(avg2, avg2, 3)); + store_u8_4x1(dst + 1 * stride, vext_u8(avg2, avg2, 2)); + store_u8_4x1(dst + 2 * stride, vext_u8(avg2, avg2, 1)); + store_u8_4x1(dst + 3 * stride, avg2); } void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, @@ -422,31 +890,15 @@ void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_); const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_); const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567); - const uint8x8_t row_0 = vget_low_u8(row); - const uint8x8_t row_1 = vget_high_u8(row); - const uint8x8_t r0 = vext_u8(row_0, row_1, 7); - const uint8x8_t r1 = vext_u8(row_0, row_1, 6); - const uint8x8_t r2 = vext_u8(row_0, row_1, 5); - const uint8x8_t r3 = vext_u8(row_0, row_1, 4); - const uint8x8_t r4 = vext_u8(row_0, row_1, 3); - const uint8x8_t r5 = vext_u8(row_0, row_1, 2); - const uint8x8_t r6 = vext_u8(row_0, row_1, 1); - - vst1_u8(dst, r0); - dst += stride; - vst1_u8(dst, r1); - dst += stride; - vst1_u8(dst, r2); - dst += stride; - vst1_u8(dst, r3); - dst += stride; - vst1_u8(dst, r4); - dst += stride; - vst1_u8(dst, r5); - dst += stride; - vst1_u8(dst, r6); - dst += stride; - vst1_u8(dst, row_0); + + vst1_u8(dst + 0 * stride, vget_low_u8(vextq_u8(row, row, 7))); + vst1_u8(dst + 1 * stride, vget_low_u8(vextq_u8(row, row, 6))); + vst1_u8(dst + 2 * stride, vget_low_u8(vextq_u8(row, row, 5))); + vst1_u8(dst + 3 * stride, vget_low_u8(vextq_u8(row, row, 4))); + vst1_u8(dst + 4 * stride, vget_low_u8(vextq_u8(row, row, 3))); + vst1_u8(dst + 5 * stride, vget_low_u8(vextq_u8(row, row, 2))); + vst1_u8(dst + 6 * stride, vget_low_u8(vextq_u8(row, row, 1))); + vst1_u8(dst + 7 * stride, vget_low_u8(row)); } static INLINE void d135_store_16x8( @@ 
-489,6 +941,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_); const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X); const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef); + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14); const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13); @@ -496,7 +949,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11); const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10); const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9); - const uint8x16_t r_7 = vcombine_u8(vget_high_u8(row_0), vget_low_u8(row_1)); + const uint8x16_t r_7 = vextq_u8(row_0, row_1, 8); const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7); const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6); const uint8x16_t r_a = vextq_u8(row_0, row_1, 5); @@ -667,6 +1120,454 @@ void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, d135_store_32x2(&dst, stride, row_0, row_1, row_2); } +// ----------------------------------------------------------------------------- + +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02; + + az = load_unaligned_u8_4x1(above - 1); + a0 = load_unaligned_u8_4x1(above + 0); + // [ left[0], above[-1], above[0], above[1], x, x, x, x ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = load_unaligned_u8_4x1(left + 0); + l1 = load_unaligned_u8_4x1(left + 1); + // [ above[-1], left[0], left[1], left[2], x, x, x, x ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + d0 = vrhadd_u8(azl0, l0); + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0); + + d02 = vrev64_u8(vzip_u8(d0, d2).val[0]); + + store_u8_4x1(dst + 0 * stride, vext_u8(d02, d1, 7)); + store_u8_4x1(dst + 1 * stride, vext_u8(d02, d1, 5)); + store_u8_4x1(dst + 2 * stride, vext_u8(d02, d1, 3)); + store_u8_4x1(dst + 3 * stride, vext_u8(d02, d1, 1)); +} + +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi; + + az = vld1_u8(above - 1); + a0 = vld1_u8(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = vld1_u8(left); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vext_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], left[0]) + // d0[1] = AVG2(left[0], left[1]) + // ... + // d0[7] = AVG2(left[6], left[7]) + d0 = vrhadd_u8(azl0, l0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + + // d2[0] = AVG3(above[-1], left[0], left[1]) + // d2[1] = AVG3(left[0], left[1], left[2]) + // ... 
+ // d2[6] = AVG3(left[5], left[6], left[7]) + // d2[7] = x (don't care) + d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vectors to put the elements to be shifted + // in at the end. The lowest lane of d02_lo is unused. + d02_lo = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[0]; + d02_hi = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ] + // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ] + // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ] + // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ] + // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ] + vst1_u8(dst + 0 * stride, vext_u8(d02_hi, d1, 7)); + vst1_u8(dst + 1 * stride, vext_u8(d02_hi, d1, 5)); + vst1_u8(dst + 2 * stride, vext_u8(d02_hi, d1, 3)); + vst1_u8(dst + 3 * stride, vext_u8(d02_hi, d1, 1)); + vst1_u8(dst + 4 * stride, vext_u8(d02_lo, d02_hi, 7)); + vst1_u8(dst + 5 * stride, vext_u8(d02_lo, d02_hi, 5)); + vst1_u8(dst + 6 * stride, vext_u8(d02_lo, d02_hi, 3)); + vst1_u8(dst + 7 * stride, vext_u8(d02_lo, d02_hi, 1)); +} + +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[15], x ] + l1 = vextq_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0 = vrhaddq_u8(azl0, l0); + d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d2 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + + d0 = vrev64q_u8(vextq_u8(d0, d0, 8)); + d2 = vrev64q_u8(vextq_u8(d2, d2, 8)); + + // The lowest lane of d02_lo is unused. 
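  // vzipq_u8 interleaves lane by lane, so after the reversal above the zip
  // yields [ d2[15], d0[15], d2[14], d0[14], ... ] (d2[15] being the unused
  // lane). The ext index in the stores below then drops by two per row,
  // prepending one more (d0, d2) column pair ahead of d1 each time.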
+ d02_lo = vzipq_u8(d2, d0).val[0]; + d02_hi = vzipq_u8(d2, d0).val[1]; + + vst1q_u8(dst + 0 * stride, vextq_u8(d02_hi, d1, 15)); + vst1q_u8(dst + 1 * stride, vextq_u8(d02_hi, d1, 13)); + vst1q_u8(dst + 2 * stride, vextq_u8(d02_hi, d1, 11)); + vst1q_u8(dst + 3 * stride, vextq_u8(d02_hi, d1, 9)); + vst1q_u8(dst + 4 * stride, vextq_u8(d02_hi, d1, 7)); + vst1q_u8(dst + 5 * stride, vextq_u8(d02_hi, d1, 5)); + vst1q_u8(dst + 6 * stride, vextq_u8(d02_hi, d1, 3)); + vst1q_u8(dst + 7 * stride, vextq_u8(d02_hi, d1, 1)); + vst1q_u8(dst + 8 * stride, vextq_u8(d02_lo, d02_hi, 15)); + vst1q_u8(dst + 9 * stride, vextq_u8(d02_lo, d02_hi, 13)); + vst1q_u8(dst + 10 * stride, vextq_u8(d02_lo, d02_hi, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(d02_lo, d02_hi, 9)); + vst1q_u8(dst + 12 * stride, vextq_u8(d02_lo, d02_hi, 7)); + vst1q_u8(dst + 13 * stride, vextq_u8(d02_lo, d02_hi, 5)); + vst1q_u8(dst + 14 * stride, vextq_u8(d02_lo, d02_hi, 3)); + vst1q_u8(dst + 15 * stride, vextq_u8(d02_lo, d02_hi, 1)); +} + +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, a14, a15, a16, l0az, l0, l1, l15, l16, l17, azl0, d0_lo, + d0_hi, d1_lo, d1_hi, d2_lo, d2_hi; + uint8x16x2_t d02_hi, d02_lo; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + a14 = vld1q_u8(above + 14); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left); + l1 = vld1q_u8(left + 1); + l15 = vld1q_u8(left + 15); + l16 = vld1q_u8(left + 16); + // The last lane here is unused, reading left[32] would cause a buffer + // over-read (observed as an address-sanitizer failure), so just fill with a + // duplicate of left[16] to avoid needing to materialize a zero: + // [ left[17], ... , left[31], x ] + l17 = vextq_u8(l16, l16, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0_lo = vrhaddq_u8(azl0, l0); + d0_hi = vrhaddq_u8(l15, l16); + + d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); + + // The highest lane of d2_hi is unused. + d2_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + d2_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); + + d0_lo = vrev64q_u8(vextq_u8(d0_lo, d0_lo, 8)); + d0_hi = vrev64q_u8(vextq_u8(d0_hi, d0_hi, 8)); + + d2_lo = vrev64q_u8(vextq_u8(d2_lo, d2_lo, 8)); + d2_hi = vrev64q_u8(vextq_u8(d2_hi, d2_hi, 8)); + + // d02_hi.val[0][0] is unused here. 
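  // Each vzipq_u8 returns both interleaved halves at once, so the two zips
  // below produce all four 16-byte column vectors consumed by the 32 rows
  // of descending stores.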
+ d02_hi = vzipq_u8(d2_hi, d0_hi); + d02_lo = vzipq_u8(d2_lo, d0_lo); + + vst1q_u8(dst + 0 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 15)); + vst1q_u8(dst + 0 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 13)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 11)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 9)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 7)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 5)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 3)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 1)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 1)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 15)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 13)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 9)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 7)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 5)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 3)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 1)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1)); + vst1q_u8(dst + 23 * stride + 16, 
vextq_u8(d02_lo.val[0], d02_lo.val[1], 1)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 15)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 13)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 11)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 9)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 7)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 5)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5)); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 3)); + vst1q_u8(dst + 30 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3)); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 1)); + vst1q_u8(dst + 31 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1)); +} + +// ----------------------------------------------------------------------------- + +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t l0, l3, l1, l2, c0, c1, c01, d0, d1; + (void)above; + + // We need the low half lanes here for the c0/c1 arithmetic but the high half + // lanes for the ext: + // [ left[0], left[1], left[2], left[3], left[0], left[1], left[2], left[3] ] + l0 = load_replicate_u8_4x1(left + 0); + l3 = vld1_dup_u8(left + 3); + + // [ left[1], left[2], left[3], left[3], x, x, x, x ] + l1 = vext_u8(l0, l3, 5); + // [ left[2], left[3], left[3], left[3], x, x, x, x ] + l2 = vext_u8(l0, l3, 6); + + c0 = vrhadd_u8(l0, l1); + c1 = vrhadd_u8(vhadd_u8(l0, l2), l1); + + // [ c0[0], c1[0], c0[1], c1[1], c0[2], c1[2], c0[3], c1[3] ] + c01 = vzip_u8(c0, c1).val[0]; + + d0 = c01; + d1 = vext_u8(c01, l3, 2); + + // Store the high half of the vector for stride={2,3} to avoid needing + // additional ext instructions: + // stride=0 [ c0[0], c1[0], c0[1], c1[1] ] + // stride=1 [ c0[1], c1[1], c0[2], c1[2] ] + // stride=2 [ c0[2], c1[2], c0[3], c1[3] ] + // stride=3 [ c0[3], c1[3], left[3], left[3] ] + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1_high(dst + 2 * stride, d0); + store_u8_4x1_high(dst + 3 * stride, d1); +} + +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t l7, l0, l1, l2, c0, c1, c01_lo, c01_hi; + (void)above; + + l0 = vld1_u8(left + 0); + l7 = vld1_dup_u8(left + 7); + + // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ] + l1 = vext_u8(l0, l7, 1); + // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ] + l2 = vext_u8(l0, l7, 2); + + c0 = vrhadd_u8(l0, l1); + c1 = vrhadd_u8(vhadd_u8(l0, l2), l1); + + c01_lo = vzip_u8(c0, c1).val[0]; + c01_hi = vzip_u8(c0, c1).val[1]; + + vst1_u8(dst + 0 * stride, c01_lo); + vst1_u8(dst + 1 * stride, vext_u8(c01_lo, c01_hi, 2)); + vst1_u8(dst + 2 * stride, vext_u8(c01_lo, c01_hi, 4)); + vst1_u8(dst + 3 * stride, vext_u8(c01_lo, c01_hi, 6)); + vst1_u8(dst + 4 * stride, c01_hi); + vst1_u8(dst + 5 * stride, vext_u8(c01_hi, l7, 
2)); + vst1_u8(dst + 6 * stride, vext_u8(c01_hi, l7, 4)); + vst1_u8(dst + 7 * stride, vext_u8(c01_hi, l7, 6)); +} + +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t l15, l0, l1, l2, c0, c1, c01_lo, c01_hi; + (void)above; + + l0 = vld1q_u8(left + 0); + l15 = vld1q_dup_u8(left + 15); + + l1 = vextq_u8(l0, l15, 1); + l2 = vextq_u8(l0, l15, 2); + + c0 = vrhaddq_u8(l0, l1); + c1 = vrhaddq_u8(vhaddq_u8(l0, l2), l1); + + c01_lo = vzipq_u8(c0, c1).val[0]; + c01_hi = vzipq_u8(c0, c1).val[1]; + + vst1q_u8(dst + 0 * stride, c01_lo); + vst1q_u8(dst + 1 * stride, vextq_u8(c01_lo, c01_hi, 2)); + vst1q_u8(dst + 2 * stride, vextq_u8(c01_lo, c01_hi, 4)); + vst1q_u8(dst + 3 * stride, vextq_u8(c01_lo, c01_hi, 6)); + vst1q_u8(dst + 4 * stride, vextq_u8(c01_lo, c01_hi, 8)); + vst1q_u8(dst + 5 * stride, vextq_u8(c01_lo, c01_hi, 10)); + vst1q_u8(dst + 6 * stride, vextq_u8(c01_lo, c01_hi, 12)); + vst1q_u8(dst + 7 * stride, vextq_u8(c01_lo, c01_hi, 14)); + vst1q_u8(dst + 8 * stride, c01_hi); + vst1q_u8(dst + 9 * stride, vextq_u8(c01_hi, l15, 2)); + vst1q_u8(dst + 10 * stride, vextq_u8(c01_hi, l15, 4)); + vst1q_u8(dst + 11 * stride, vextq_u8(c01_hi, l15, 6)); + vst1q_u8(dst + 12 * stride, vextq_u8(c01_hi, l15, 8)); + vst1q_u8(dst + 13 * stride, vextq_u8(c01_hi, l15, 10)); + vst1q_u8(dst + 14 * stride, vextq_u8(c01_hi, l15, 12)); + vst1q_u8(dst + 15 * stride, vextq_u8(c01_hi, l15, 14)); +} + +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t l0_lo, l0_hi, l1_lo, l1_hi, l2_lo, l2_hi, l31, c0_lo, c0_hi, c1_lo, + c1_hi, c01[4]; + (void)above; + + l0_lo = vld1q_u8(left + 0); + l0_hi = vld1q_u8(left + 16); + l31 = vld1q_dup_u8(left + 31); + + l1_lo = vextq_u8(l0_lo, l0_hi, 1); + l1_hi = vextq_u8(l0_hi, l31, 1); + l2_lo = vextq_u8(l0_lo, l0_hi, 2); + l2_hi = vextq_u8(l0_hi, l31, 2); + + c0_lo = vrhaddq_u8(l0_lo, l1_lo); + c0_hi = vrhaddq_u8(l0_hi, l1_hi); + c1_lo = vrhaddq_u8(vhaddq_u8(l0_lo, l2_lo), l1_lo); + c1_hi = vrhaddq_u8(vhaddq_u8(l0_hi, l2_hi), l1_hi); + + c01[0] = vzipq_u8(c0_lo, c1_lo).val[0]; + c01[1] = vzipq_u8(c0_lo, c1_lo).val[1]; + c01[2] = vzipq_u8(c0_hi, c1_hi).val[0]; + c01[3] = vzipq_u8(c0_hi, c1_hi).val[1]; + + vst1q_u8(dst + 0 * stride + 0, c01[0]); + vst1q_u8(dst + 0 * stride + 16, c01[1]); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(c01[0], c01[1], 2)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(c01[1], c01[2], 2)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(c01[0], c01[1], 4)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(c01[1], c01[2], 4)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(c01[0], c01[1], 6)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(c01[1], c01[2], 6)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(c01[0], c01[1], 8)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(c01[1], c01[2], 8)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(c01[0], c01[1], 10)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(c01[1], c01[2], 10)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(c01[0], c01[1], 12)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(c01[1], c01[2], 12)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(c01[0], c01[1], 14)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(c01[1], c01[2], 14)); + vst1q_u8(dst + 8 * stride + 0, c01[1]); + vst1q_u8(dst + 8 * stride + 16, c01[2]); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(c01[1], c01[2], 2)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(c01[2], c01[3], 2)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(c01[1], c01[2], 4)); + 
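  // (The +16 halves lag the +0 halves by one 16-byte vector: each row
  // applies the same shift one position further along the c01[] sequence,
  // with the replicated left[31] filling in past the end of the column.)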
vst1q_u8(dst + 10 * stride + 16, vextq_u8(c01[2], c01[3], 4)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(c01[1], c01[2], 6)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(c01[2], c01[3], 6)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(c01[1], c01[2], 8)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(c01[2], c01[3], 8)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(c01[1], c01[2], 10)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(c01[2], c01[3], 10)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(c01[1], c01[2], 12)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(c01[2], c01[3], 12)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(c01[1], c01[2], 14)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(c01[2], c01[3], 14)); + vst1q_u8(dst + 16 * stride + 0, c01[2]); + vst1q_u8(dst + 16 * stride + 16, c01[3]); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(c01[2], c01[3], 2)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(c01[3], l31, 2)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(c01[2], c01[3], 4)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(c01[3], l31, 4)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(c01[2], c01[3], 6)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(c01[3], l31, 6)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(c01[2], c01[3], 8)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(c01[3], l31, 8)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(c01[2], c01[3], 10)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(c01[3], l31, 10)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(c01[2], c01[3], 12)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(c01[3], l31, 12)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(c01[2], c01[3], 14)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(c01[3], l31, 14)); + vst1q_u8(dst + 24 * stride + 0, c01[3]); + vst1q_u8(dst + 24 * stride + 16, l31); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(c01[3], l31, 2)); + vst1q_u8(dst + 25 * stride + 16, l31); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(c01[3], l31, 4)); + vst1q_u8(dst + 26 * stride + 16, l31); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(c01[3], l31, 6)); + vst1q_u8(dst + 27 * stride + 16, l31); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(c01[3], l31, 8)); + vst1q_u8(dst + 28 * stride + 16, l31); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(c01[3], l31, 10)); + vst1q_u8(dst + 29 * stride + 16, l31); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(c01[3], l31, 12)); + vst1q_u8(dst + 30 * stride + 16, l31); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(c01[3], l31, 14)); + vst1q_u8(dst + 31 * stride + 16, l31); +} + +// ----------------------------------------------------------------------------- + #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 19cfc7c7f..586bfb85a 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -102,6 +102,16 @@ static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { #endif } +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void store_s32q_to_tran_low(tran_low_t *buf, const int32x4_t a) { + vst1q_s32(buf, a); +} + +static INLINE int32x4_t load_tran_low_to_s32q(const tran_low_t *buf) { + return vld1q_s32(buf); +} +#endif + // Propagate type information to the compiler. Without this the compiler may // assume the required alignment of uint32_t (4 bytes) and add alignment hints // to the memory access. @@ -112,6 +122,34 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { memcpy(buf, &a, 4); } +// Load 4 contiguous bytes when alignment is not guaranteed. 
+static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) { + uint32_t a; + uint32x2_t a_u32; + memcpy(&a, buf, 4); + a_u32 = vdup_n_u32(0); + a_u32 = vset_lane_u32(a, a_u32, 0); + return vreinterpret_u8_u32(a_u32); +} + +// Load 4 contiguous bytes and replicate across a vector when alignment is not +// guaranteed. +static INLINE uint8x8_t load_replicate_u8_4x1(const uint8_t *buf) { + uint32_t a; + memcpy(&a, buf, 4); + return vreinterpret_u8_u32(vdup_n_u32(a)); +} + +// Store 4 contiguous bytes from the low half of an 8x8 vector. +static INLINE void store_u8_4x1(uint8_t *buf, uint8x8_t a) { + vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 0); +} + +// Store 4 contiguous bytes from the high half of an 8x8 vector. +static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) { + vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 1); +} + // Load 2 sets of 4 bytes when alignment is not guaranteed. static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, ptrdiff_t stride) { @@ -126,6 +164,29 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, return vreinterpret_u8_u32(a_u32); } +// Load 8 bytes when alignment is not guaranteed. +static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) { + uint64_t a; + uint64x1_t a_u64 = vdup_n_u64(0); + memcpy(&a, buf, 8); + a_u64 = vset_lane_u64(a, a_u64, 0); + return vreinterpret_u16_u64(a_u64); +} + +// Load 2 sets of 8 bytes when alignment is not guaranteed. +static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf, + ptrdiff_t stride) { + uint64_t a; + uint64x2_t a_u64; + if (stride == 4) return vld1q_u16(buf); + memcpy(&a, buf, 8); + buf += stride; + a_u64 = vdupq_n_u64(a); + memcpy(&a, buf, 8); + a_u64 = vsetq_lane_u64(a, a_u64, 1); + return vreinterpretq_u16_u64(a_u64); +} + // Store 2 sets of 4 bytes when alignment is not guaranteed. 
static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { @@ -202,6 +263,16 @@ static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { vst1_lane_u32((uint32_t *)buf, a_u32, 1); } +static INLINE void store_u8_8x3(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); +} + static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2, uint8x8_t *const s3) { @@ -226,6 +297,16 @@ static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p, vst1_u8(s, s3); } +static INLINE void load_u8_16x3(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); +} + static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2, uint8x16_t *const s3) { @@ -358,4 +439,25 @@ static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p, vst1q_u8(s, s7); } +static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, + uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5, + uint16x8_t *s6, uint16x8_t *s7) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; + *s4 = vld1q_u16(s); + s += p; + *s5 = vld1q_u16(s); + s += p; + *s6 = vld1q_u16(s); + s += p; + *s7 = vld1q_u16(s); +} + #endif // VPX_VPX_DSP_ARM_MEM_NEON_H_ diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index 9c227d560..e2351fa2c 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,6 +14,8 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, const int16x8_t dequant, @@ -69,20 +71,19 @@ quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; + int16_t const *iscan = scan_order->iscan; // Only the first element of each vector is DC. - int16x8_t zbin = vld1q_s16(zbin_ptr); - int16x8_t round = vld1q_s16(round_ptr); - int16x8_t quant = vld1q_s16(quant_ptr); - int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t zbin = vld1q_s16(mb_plane->zbin); + int16x8_t round = vld1q_s16(mb_plane->round); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. 
@@ -132,7 +133,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } while (n_coeffs > 0); } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -142,10 +143,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)scan; +#endif // VPX_ARCH_AARCH64 } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { @@ -213,23 +211,21 @@ quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. -void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. - int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); - int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - int16x8_t quant = vld1q_s16(quant_ptr); - int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. 
@@ -276,7 +272,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -286,9 +282,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)n_coeffs; - (void)scan; +#endif // VPX_ARCH_AARCH64 } diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 5fc621aee..713eec7a9 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -17,633 +17,212 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, - const void *const buf1) { - uint32_t a; - uint32x2_t aa; - memcpy(&a, buf0, 4); - aa = vdup_n_u32(a); - memcpy(&a, buf1, 4); - aa = vset_lane_u32(a, aa, 1); - return vreinterpret_u8_u32(aa); -} - -static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, - const uint8_t *const ref_array[4], - const int ref_stride, const int height, - uint32_t sad_array[4]) { - int i; - uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; -#if !defined(__aarch64__) - uint16x4_t a[2]; -#endif - uint32x4_t r; - - assert(!((intptr_t)src_ptr % sizeof(uint32_t))); - assert(!(src_stride % sizeof(uint32_t))); - - for (i = 0; i < height; ++i) { - const uint8x8_t s = vreinterpret_u8_u32( - vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride))); - const uint8x8_t ref01 = load_unaligned_2_buffers( - ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride); - const uint8x8_t ref23 = load_unaligned_2_buffers( - ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride); - abs[0] = vabal_u8(abs[0], s, ref01); - abs[1] = vabal_u8(abs[1], s, ref23); - } - -#if defined(__aarch64__) - abs[0] = vpaddq_u16(abs[0], abs[1]); - r = vpaddlq_u16(abs[0]); -#else - a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0])); - a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1])); - r = vpaddlq_u16(vcombine_u16(a[0], a[1])); -#endif - vst1q_u32(sad_array, r); -} - -void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, sad_array); -} - -void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, sad_array); -} - -//////////////////////////////////////////////////////////////////////////////// - -// Can handle 512 pixels' sad sum (such as 16x32 or 32x16) -static INLINE void sad_512_pel_final_neon(const uint16x8_t sum[4], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); - const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); - const uint16x8_t b0 = vpaddq_u16(a0, a1); - const uint32x4_t r = vpaddlq_u16(b0); -#else - const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); - const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); - const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); - const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); - const uint16x4_t b0 = vpadd_u16(a0, a1); - const 
uint16x4_t b1 = vpadd_u16(a2, a3); - const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1)); -#endif - vst1q_u32(sad_array, r); -} - -#if defined(__arm__) || !defined(__ARM_FEATURE_DOTPROD) - -// Can handle 1024 pixels' sad sum (such as 32x32) -static INLINE void sad_1024_pel_final_neon(const uint16x8_t sum[4], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); - const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); - const uint32x4_t b0 = vpaddlq_u16(a0); - const uint32x4_t b1 = vpaddlq_u16(a1); - const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(sad_array, r); -#else - const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); - const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); - const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); - const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); - const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1)); - const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3)); - const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0)); - const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1)); - vst1q_u32(sad_array, vcombine_u32(c0, c1)); -#endif -} - -// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32) -static INLINE void sad_2048_pel_final_neon(const uint16x8_t sum[4], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x4_t b0 = vpaddq_u32(a0, a1); - const uint32x4_t b1 = vpaddq_u32(a2, a3); - const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(sad_array, r); -#else - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0)); - const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1)); - const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2)); - const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3)); - const uint32x2_t c0 = vpadd_u32(b0, b1); - const uint32x2_t c1 = vpadd_u32(b2, b3); - vst1q_u32(sad_array, vcombine_u32(c0, c1)); -#endif -} - -// Can handle 4096 pixels' sad sum (such as 64x64) -static INLINE void sad_4096_pel_final_neon(const uint16x8_t sum[8], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x4_t a4 = vpaddlq_u16(sum[4]); - const uint32x4_t a5 = vpaddlq_u16(sum[5]); - const uint32x4_t a6 = vpaddlq_u16(sum[6]); - const uint32x4_t a7 = vpaddlq_u16(sum[7]); - const uint32x4_t b0 = vaddq_u32(a0, a1); - const uint32x4_t b1 = vaddq_u32(a2, a3); - const uint32x4_t b2 = vaddq_u32(a4, a5); - const uint32x4_t b3 = vaddq_u32(a6, a7); - const uint32x4_t c0 = vpaddq_u32(b0, b1); - const uint32x4_t c1 = vpaddq_u32(b2, b3); - const uint32x4_t r = vpaddq_u32(c0, c1); - vst1q_u32(sad_array, r); -#else - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x4_t a4 = vpaddlq_u16(sum[4]); - const uint32x4_t a5 = vpaddlq_u16(sum[5]); - 
const uint32x4_t a6 = vpaddlq_u16(sum[6]); - const uint32x4_t a7 = vpaddlq_u16(sum[7]); - const uint32x4_t b0 = vaddq_u32(a0, a1); - const uint32x4_t b1 = vaddq_u32(a2, a3); - const uint32x4_t b2 = vaddq_u32(a4, a5); - const uint32x4_t b3 = vaddq_u32(a6, a7); - const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); - const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); - const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); - const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); - const uint32x2_t d0 = vpadd_u32(c0, c1); - const uint32x2_t d1 = vpadd_u32(c2, c3); - vst1q_u32(sad_array, vcombine_u32(d0, d1)); -#endif -} - -#endif - -static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i, j; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint16x8_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vpadalq_u8(*sad_sum, abs_diff); +} + +static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = 0; + do { + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + i++; + } while (i < h); + + vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi)); +} + +static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = 0; + do { + uint8x16_t s0, s1; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + 
sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + i++; + } while (i < h); + + vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi)); +} + +static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - for (i = 0; i < height; ++i) { - const uint8x8_t s = vld1_u8(src_ptr); - src_ptr += src_stride; - for (j = 0; j < 4; ++j) { - const uint8x8_t b_u8 = vld1_u8(ref_loop[j]); - ref_loop[j] += ref_stride; - sum[j] = vabal_u8(sum[j], s, b_u8); - } - } + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); - sad_512_pel_final_neon(sum, sad_array); -} + i++; + } while (i < h); -void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 4); + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } -void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); -} - -void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); -} - -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, - uint32x4_t *const sum) { - const uint8x16_t r = vld1q_u8(ref_ptr); - const uint8x16_t diff = vabdq_u8(src_ptr, r); - *sum = vdotq_u32(*sum, diff, vdupq_n_u8(1)); -} - -static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i; - uint32x4_t r0, r1; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - - for (i = 0; i < height; ++i) { - const uint8x16_t s = vld1q_u8(src_ptr + i * src_stride); - sad16_neon(ref_loop[0] + i * ref_stride, s, &sum[0]); - sad16_neon(ref_loop[1] + i * ref_stride, s, &sum[1]); - sad16_neon(ref_loop[2] + i * ref_stride, s, &sum[2]); - sad16_neon(ref_loop[3] + i * ref_stride, s, &sum[3]); - } - - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); -} - -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) - -static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, - uint16x8_t *const sum) { - const uint8x16_t r = vld1q_u8(ref_ptr); - *sum = vabal_u8(*sum, 
vget_low_u8(src_ptr), vget_low_u8(r)); - *sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r)); +static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref, + uint16x8_t *const sad_sum) { + uint8x8_t abs_diff = vabd_u8(src, ref); + *sad_sum = vaddw_u8(*sad_sum, abs_diff); } -static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; +static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - for (i = 0; i < height; ++i) { - const uint8x16_t s = vld1q_u8(src_ptr); - src_ptr += src_stride; - /* Manual unrolling here stops the compiler from getting confused. */ - sad16_neon(ref_loop[0], s, &sum[0]); - ref_loop[0] += ref_stride; - sad16_neon(ref_loop[1], s, &sum[1]); - ref_loop[1] += ref_stride; - sad16_neon(ref_loop[2], s, &sum[2]); - ref_loop[2] += ref_stride; - sad16_neon(ref_loop[3], s, &sum[3]); - ref_loop[3] += ref_stride; - } - - sad_512_pel_final_neon(sum, sad_array); -} + int i = 0; + do { + const uint8x8_t s = vld1_u8(src + i * src_stride); + sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]); + sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]); + sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]); + sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]); -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); -} + i++; + } while (i < h); -void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } -void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32); -} - -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i; - uint32x4_t r0, r1; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - - for (i = 0; i < height; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - 
ref_loop[3] += ref_stride; - } - - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); -} - -void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); -} - -void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32); -} - -void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64); -} +static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) - -static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - const int height, uint16x8_t *const sum) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - - sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } -} + int i = 0; + do { + uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride); + uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride); + uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride); + uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride); + uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride); -void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - uint16x8_t sum[4]; - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum); - sad_512_pel_final_neon(sum, sad_array); -} + sad8_neon(s, r0, &sum[0]); + sad8_neon(s, r1, &sum[1]); + sad8_neon(s, r2, &sum[2]); + sad8_neon(s, r3, &sum[3]); -void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - uint16x8_t sum[4]; - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum); - sad_1024_pel_final_neon(sum, sad_array); -} + i += 2; + } while (i < h); -void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - uint16x8_t sum[4]; - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum); - sad_2048_pel_final_neon(sum, sad_array); + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } -#endif // 
defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - uint32x4_t r0, r1; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - - for (i = 0; i < 32; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; +#define SAD_WXH_4D_NEON(w, h) \ + void vpx_sad##w##x##h##x4d_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon(src_ptr, src_stride, ref_array, ref_stride, sad_array, \ + (h)); \ } - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); -} - -void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - uint32x4_t r0, r1, r2, r3; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0), vdupq_n_u32(0) }; - - for (i = 0; i < 64; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - 
ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; +SAD_WXH_4D_NEON(4, 4) +SAD_WXH_4D_NEON(4, 8) + +SAD_WXH_4D_NEON(8, 4) +SAD_WXH_4D_NEON(8, 8) +SAD_WXH_4D_NEON(8, 16) + +SAD_WXH_4D_NEON(16, 8) +SAD_WXH_4D_NEON(16, 16) +SAD_WXH_4D_NEON(16, 32) + +SAD_WXH_4D_NEON(32, 16) +SAD_WXH_4D_NEON(32, 32) +SAD_WXH_4D_NEON(32, 64) + +SAD_WXH_4D_NEON(64, 32) +SAD_WXH_4D_NEON(64, 64) + +#undef SAD_WXH_4D_NEON + +#define SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon(src_ptr, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ } - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - r2 = vpaddq_u32(sum[4], sum[5]); - r3 = vpaddq_u32(sum[6], sum[7]); - r0 = vpaddq_u32(r0, r1); - r1 = vpaddq_u32(r2, r3); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); -} - -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) - -void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; +SAD_SKIP_WXH_4D_NEON(4, 4) +SAD_SKIP_WXH_4D_NEON(4, 8) - for (i = 0; i < 32; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } +SAD_SKIP_WXH_4D_NEON(8, 4) +SAD_SKIP_WXH_4D_NEON(8, 8) +SAD_SKIP_WXH_4D_NEON(8, 16) - sad_2048_pel_final_neon(sum, sad_array); -} +SAD_SKIP_WXH_4D_NEON(16, 8) +SAD_SKIP_WXH_4D_NEON(16, 16) +SAD_SKIP_WXH_4D_NEON(16, 32) -void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0), vdupq_n_u16(0) }; - - for (i = 0; i < 64; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); - - s = 
vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } +SAD_SKIP_WXH_4D_NEON(32, 16) +SAD_SKIP_WXH_4D_NEON(32, 32) +SAD_SKIP_WXH_4D_NEON(32, 64) - sad_4096_pel_final_neon(sum, sad_array); -} +SAD_SKIP_WXH_4D_NEON(64, 32) +SAD_SKIP_WXH_4D_NEON(64, 64) -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#undef SAD_SKIP_WXH_4D_NEON diff --git a/vpx_dsp/arm/sad4d_neon_dotprod.c b/vpx_dsp/arm/sad4d_neon_dotprod.c new file mode 100644 index 000000000..933fc48b8 --- /dev/null +++ b/vpx_dsp/arm/sad4d_neon_dotprod.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint32x4_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1)); +} + +static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + 
sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint8x16_t s0, s1; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +#define SAD_WXH_4D_NEON_DOTPROD(w, h) \ + void vpx_sad##w##x##h##x4d_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon_dotprod(src_ptr, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ + } + +SAD_WXH_4D_NEON_DOTPROD(16, 8) +SAD_WXH_4D_NEON_DOTPROD(16, 16) +SAD_WXH_4D_NEON_DOTPROD(16, 32) + +SAD_WXH_4D_NEON_DOTPROD(32, 16) +SAD_WXH_4D_NEON_DOTPROD(32, 32) +SAD_WXH_4D_NEON_DOTPROD(32, 64) + +SAD_WXH_4D_NEON_DOTPROD(64, 32) +SAD_WXH_4D_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_4D_NEON_DOTPROD + +#define SAD_SKIP_WXH_4D_NEON_DOTPROD(w, h) \ + void vpx_sad_skip_##w##x##h##x4d_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon_dotprod(src_ptr, 2 * src_stride, ref_array, \ + 2 * ref_stride, sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + 
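The SAD_SKIP_* wrappers above and below implement the "downsampled SAD" used to cheapen motion search: only every other row is compared, by doubling both strides over half the height, and the partial total is doubled (the <<= 1 on each lane) to approximate the full-block SAD with roughly half the memory traffic. A minimal scalar model of that contract follows; the helper names are illustrative only and not part of the patch:

    #include <stdint.h>
    #include <stdlib.h>

    /* Reference scalar SAD over a w x h block. */
    static uint32_t sad_scalar(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride, int w,
                               int h) {
      uint32_t sad = 0;
      for (int i = 0; i < h; i++) {
        for (int j = 0; j < w; j++) {
          sad += (uint32_t)abs(src[i * src_stride + j] -
                               ref[i * ref_stride + j]);
        }
      }
      return sad;
    }

    /* Model of the skip variant: even rows only, result doubled. */
    static uint32_t sad_skip_scalar(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride, int w,
                                    int h) {
      return 2 * sad_scalar(src, 2 * src_stride, ref, 2 * ref_stride, w,
                            h / 2);
    }

The doubled result is an estimate (odd rows are never read), which is acceptable when ranking motion-search candidates against one another rather than against an exact threshold.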
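For context on the dot-product idiom that distinguishes these *_neon_dotprod paths: vabdq_u8 yields sixteen byte-wide absolute differences, and vdotq_u32 against an all-ones vector folds each group of four adjacent bytes into a 32-bit lane, so one 16-pixel SAD row costs two instructions with no overflow concern. The plain Neon versions must instead widen through vabal_u8/vpadalq_u8 into uint16x8_t accumulators and reduce with helpers such as horizontal_long_add_4d_uint16x8. Below is a hypothetical stand-alone check of the idiom against a scalar sum, assuming an AArch64 toolchain built with -march=armv8.2-a+dotprod; it is not part of the patch:

    #include <arm_neon.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* 16-byte SAD via the ABD + UDOT sequence used above. */
    static uint32_t sad16_dotprod(const uint8_t *src, const uint8_t *ref) {
      uint8x16_t abs_diff = vabdq_u8(vld1q_u8(src), vld1q_u8(ref));
      /* Dot product with all-ones sums each 4-byte group into a u32 lane. */
      uint32x4_t sum = vdotq_u32(vdupq_n_u32(0), abs_diff, vdupq_n_u8(1));
      return vaddvq_u32(sum); /* AArch64 horizontal add. */
    }

    int main(void) {
      uint8_t src[16], ref[16];
      uint32_t scalar = 0;
      for (int i = 0; i < 16; i++) {
        src[i] = (uint8_t)rand();
        ref[i] = (uint8_t)rand();
        scalar += (uint32_t)abs(src[i] - ref[i]);
      }
      printf("scalar=%u neon=%u\n", scalar, sad16_dotprod(src, ref));
      return 0;
    }

The same two-instruction ABD, UDOT pattern is also why the sad_neon_dotprod.c code further down notes that two uint32x4_t accumulators are enough for optimal throughput on CPUs with either two or four Neon pipes.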
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8) +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16) +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32) + +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16) +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32) +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64) + +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32) +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64) + +#undef SAD_SKIP_WXH_4D_NEON_DOTPROD diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index ad575d4aa..4dd87ddc0 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -17,635 +17,375 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); -#if defined(__ARM_FEATURE_DOTPROD) - const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); - const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); - return horizontal_add_uint32x4(dp); -#else - uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - return horizontal_add_uint16x8(abs); -#endif -} +static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3; + uint8x16_t diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + diff2 = vabdq_u8(s2, r2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + diff3 = vabdq_u8(s3, r3); + sum[3] = vpadalq_u8(sum[3], diff3); -uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); -#if defined(__ARM_FEATURE_DOTPROD) - const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg); - const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); - return horizontal_add_uint32x4(prod); -#else - uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); - return horizontal_add_uint16x8(abs); -#endif -} + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); -uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { -#if defined(__ARM_FEATURE_DOTPROD) - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t src2_u8 = - load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); - const uint8x16_t ref2_u8 = - load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); - const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, 
ref1_u8); - const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, ref2_u8); - prod = vdotq_u32(prod, sad1_u8, ones); - prod = vdotq_u32(prod, sad2_u8, ones); - return horizontal_add_uint32x4(prod); -#else - int i; - uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 8; i += 4) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(ref_u8)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - } + sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); - return horizontal_add_uint16x8(abs); -#endif + return horizontal_add_uint32x4(sum_u32); } -uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred) { -#if defined(__ARM_FEATURE_DOTPROD) - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t src2_u8 = - load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); - const uint8x16_t ref2_u8 = - load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); - const uint8x16_t second_pred1_u8 = vld1q_u8(second_pred); - const uint8x16_t second_pred2_u8 = vld1q_u8(second_pred + 16); - const uint8x16_t avg1 = vrhaddq_u8(ref1_u8, second_pred1_u8); - const uint8x16_t avg2 = vrhaddq_u8(ref2_u8, second_pred2_u8); - const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, avg1); - const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, avg2); - prod = vdotq_u32(prod, sad1_u8, ones); - prod = vdotq_u32(prod, sad2_u8, ones); - return horizontal_add_uint32x4(prod); -#else - int i; - uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 8; i += 4) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - second_pred += 16; - abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(avg)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); - } +static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint32x4_t sum = vdupq_n_u32(0); - return horizontal_add_uint16x8(abs); -#endif -} + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t diff0 = vabdq_u8(s0, r0); + uint16x8_t sum0 = vpaddlq_u8(diff0); -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x2_t prod = vdup_n_u32(0); - const uint8x8_t ones = vdup_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - const uint8x8_t sad_u8 = vabd_u8(a_u8, b_u8); - src_ptr += src_stride; - ref_ptr += ref_stride; - prod = vdot_u32(prod, sad_u8, ones); - } - return prod; -} + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t diff1 = vabdq_u8(s1, r1); + uint16x8_t sum1 = vpaddlq_u8(diff1); + + sum 
= vpadalq_u16(sum, sum0); + sum = vpadalq_u16(sum, sum1); -static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x2_t prod = vdup_n_u32(0); - const uint8x8_t ones = vdup_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - const uint8x8_t c_u8 = vld1_u8(second_pred); - const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); - const uint8x8_t sad_u8 = vabd_u8(a_u8, avg); src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 8; - prod = vdot_u32(prod, sad_u8, ones); - } - return prod; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); } -#define SAD8XN(n) \ - uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x2_t prod = \ - sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x2(prod); \ - } \ - \ - uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x2_t prod = \ - sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x2(prod); \ - } +static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); -#else // !defined(__ARM_FEATURE_DOTPROD) -static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - src_ptr += src_stride; - ref_ptr += ref_stride; - abs = vabal_u8(abs, a_u8, b_u8); - } - return abs; -} + uint8x16_t diff = vabdq_u8(s, r); + sum = vpadalq_u8(sum, diff); -static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - const uint8x8_t c_u8 = vld1_u8(second_pred); - const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 8; - abs = vabal_u8(abs, a_u8, avg); - } - return abs; -} + } while (--i != 0); -#define SAD8XN(n) \ - uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint16x8(abs); \ - } \ - \ - uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint16x8(abs); \ - } -#endif // defined(__ARM_FEATURE_DOTPROD) - -SAD8XN(4) -SAD8XN(8) -SAD8XN(16) - -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - 
const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t src_u8 = vld1q_u8(src_ptr); - const uint8x16_t ref_u8 = vld1q_u8(ref_ptr); - const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); - src_ptr += src_stride; - ref_ptr += ref_stride; - prod = vdotq_u32(prod, sad_u8, ones); - } - return prod; + return horizontal_add_uint16x8(sum); } -static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr); - const uint8x16_t c_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); - const uint8x16_t sad_u8 = vabdq_u8(a_u8, avg); - src_ptr += src_stride; - ref_ptr += ref_stride; - second_pred += 16; - prod = vdotq_u32(prod, sad_u8, ones); - } - return prod; -} +static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); -#define SAD16XN(n) \ - uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x4_t prod = \ - sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x4(prod); \ - } \ - \ - uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t prod = \ - sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x4(prod); \ - } -#else // !defined(__ARM_FEATURE_DOTPROD) -static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr); - src_ptr += src_stride; - ref_ptr += ref_stride; - abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8)); - abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8)); - } - return abs; -} + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + + sum = vabal_u8(sum, s, r); -static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr); - const uint8x16_t c_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 16; - abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg)); - abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg)); - } - return abs; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#define SAD16XN(n) \ - uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint16x8_t abs = \ - sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint16x8(abs); \ - } \ - \ - uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - 
const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint16x8(abs); \ - } -#endif // defined(__ARM_FEATURE_DOTPROD) - -SAD16XN(8) -SAD16XN(16) -SAD16XN(32) - -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); - const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, b_lo); - const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, b_hi); - src_ptr += src_stride; - ref_ptr += ref_stride; - prod = vdotq_u32(prod, sad_lo_u8, ones); - prod = vdotq_u32(prod, sad_hi_u8, ones); - } - return prod; +static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h / 2; + do { + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + + sum = vabal_u8(sum, s, r); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); - const uint8x16_t c_lo = vld1q_u8(second_pred); - const uint8x16_t c_hi = vld1q_u8(second_pred + 16); - const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); - const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); - const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, avg_lo); - const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, avg_hi); - src_ptr += src_stride; - ref_ptr += ref_stride; - second_pred += 32; - prod = vdotq_u32(prod, sad_lo_u8, ones); - prod = vdotq_u32(prod, sad_hi_u8, ones); +#define SAD_WXH_NEON(w, h) \ + unsigned int vpx_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ } - return prod; -} -#define SAD32XN(n) \ - uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x4_t prod = \ - sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x4(prod); \ - } \ - \ - uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t prod = \ - sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x4(prod); \ +SAD_WXH_NEON(4, 4) +SAD_WXH_NEON(4, 8) + +SAD_WXH_NEON(8, 4) +SAD_WXH_NEON(8, 8) +SAD_WXH_NEON(8, 16) + +SAD_WXH_NEON(16, 8) +SAD_WXH_NEON(16, 16) +SAD_WXH_NEON(16, 32) + +SAD_WXH_NEON(32, 16) +SAD_WXH_NEON(32, 32) +SAD_WXH_NEON(32, 64) + +SAD_WXH_NEON(64, 32) +SAD_WXH_NEON(64, 64) + +#undef SAD_WXH_NEON + +#define 
SAD_SKIP_WXH_NEON(w, h) \ + unsigned int vpx_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \ } -#else // defined(__ARM_FEATURE_DOTPROD) -static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); +SAD_SKIP_WXH_NEON(4, 4) +SAD_SKIP_WXH_NEON(4, 8) + +SAD_SKIP_WXH_NEON(8, 4) +SAD_SKIP_WXH_NEON(8, 8) +SAD_SKIP_WXH_NEON(8, 16) + +SAD_SKIP_WXH_NEON(16, 8) +SAD_SKIP_WXH_NEON(16, 16) +SAD_SKIP_WXH_NEON(16, 32) + +SAD_SKIP_WXH_NEON(32, 16) +SAD_SKIP_WXH_NEON(32, 32) +SAD_SKIP_WXH_NEON(32, 64) + +SAD_SKIP_WXH_NEON(64, 32) +SAD_SKIP_WXH_NEON(64, 64) + +#undef SAD_SKIP_WXH_NEON + +static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + p2 = vld1q_u8(second_pred + 32); + avg2 = vrhaddq_u8(r2, p2); + diff2 = vabdq_u8(s2, avg2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + p3 = vld1q_u8(second_pred + 48); + avg3 = vrhaddq_u8(r3, p3); + diff3 = vabdq_u8(s3, avg3); + sum[3] = vpadalq_u8(sum[3], diff3); + src_ptr += src_stride; ref_ptr += ref_stride; - abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo)); - abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo)); - abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi)); - abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(b_hi)); - } - return abs; + second_pred += 64; + } while (--i != 0); + + sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + + return horizontal_add_uint32x4(sum_u32); } -static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); - const uint8x16_t c_lo = vld1q_u8(second_pred); - const uint8x16_t c_hi = vld1q_u8(second_pred + 16); - const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); - const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); +static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr, + 
int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t avg0 = vrhaddq_u8(r0, p0); + uint8x16_t diff0 = vabdq_u8(s0, avg0); + uint16x8_t sum0 = vpaddlq_u8(diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t avg1 = vrhaddq_u8(r1, p1); + uint8x16_t diff1 = vabdq_u8(s1, avg1); + uint16x8_t sum1 = vpaddlq_u8(diff1); + + sum = vpadalq_u16(sum, sum0); + sum = vpadalq_u16(sum, sum1); + src_ptr += src_stride; ref_ptr += ref_stride; second_pred += 32; - abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo)); - abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo)); - abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi)); - abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(avg_hi)); - } - return abs; -} + } while (--i != 0); -#define SAD32XN(n) \ - uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint16x8_t abs = \ - sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint16x8(abs); \ - } \ - \ - uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint16x8(abs); \ - } -#endif // defined(__ARM_FEATURE_DOTPROD) - -SAD32XN(16) -SAD32XN(32) -SAD32XN(64) - -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); - const uint8x16_t sad_0_u8 = vabdq_u8(a_0, b_0); - const uint8x16_t sad_1_u8 = vabdq_u8(a_1, b_1); - const uint8x16_t sad_2_u8 = vabdq_u8(a_2, b_2); - const uint8x16_t sad_3_u8 = vabdq_u8(a_3, b_3); - src_ptr += src_stride; - ref_ptr += ref_stride; - prod = vdotq_u32(prod, sad_0_u8, ones); - prod = vdotq_u32(prod, sad_1_u8, ones); - prod = vdotq_u32(prod, sad_2_u8, ones); - prod = vdotq_u32(prod, sad_3_u8, ones); - } - return prod; + return horizontal_add_uint32x4(sum); } -static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = 
vld1q_u8(ref_ptr + 48); - const uint8x16_t c_0 = vld1q_u8(second_pred); - const uint8x16_t c_1 = vld1q_u8(second_pred + 16); - const uint8x16_t c_2 = vld1q_u8(second_pred + 32); - const uint8x16_t c_3 = vld1q_u8(second_pred + 48); - const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); - const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); - const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); - const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); - const uint8x16_t sad_0_u8 = vabdq_u8(a_0, avg_0); - const uint8x16_t sad_1_u8 = vabdq_u8(a_1, avg_1); - const uint8x16_t sad_2_u8 = vabdq_u8(a_2, avg_2); - const uint8x16_t sad_3_u8 = vabdq_u8(a_3, avg_3); +static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = vrhaddq_u8(r, p); + uint8x16_t diff = vabdq_u8(s, avg); + sum = vpadalq_u8(sum, diff); + src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 64; - prod = vdotq_u32(prod, sad_0_u8, ones); - prod = vdotq_u32(prod, sad_1_u8, ones); - prod = vdotq_u32(prod, sad_2_u8, ones); - prod = vdotq_u32(prod, sad_3_u8, ones); - } - return prod; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#else // !defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs_0 = vdupq_n_u16(0); - uint16x8_t abs_1 = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + +static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); + src_ptr += src_stride; ref_ptr += ref_stride; - abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0)); - abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(b_1)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(b_2)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(b_2)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(b_3)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(b_3)); - } + second_pred += 8; + } while (--i != 0); - { - const uint32x4_t sum = vpaddlq_u16(abs_0); - return vpadalq_u16(sum, abs_1); - } + return horizontal_add_uint16x8(sum); } -static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs_0 = vdupq_n_u16(0); - uint16x8_t abs_1 = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = 
vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); - const uint8x16_t c_0 = vld1q_u8(second_pred); - const uint8x16_t c_1 = vld1q_u8(second_pred + 16); - const uint8x16_t c_2 = vld1q_u8(second_pred + 32); - const uint8x16_t c_3 = vld1q_u8(second_pred + 48); - const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); - const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); - const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); - const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); - src_ptr += src_stride; - ref_ptr += ref_stride; - second_pred += 64; - abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0)); - abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(avg_1)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(avg_2)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(avg_2)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(avg_3)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(avg_3)); - } +static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); - { - const uint32x4_t sum = vpaddlq_u16(abs_0); - return vpadalq_u16(sum, abs_1); - } + int i = h / 2; + do { + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + second_pred += 8; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#endif // defined(__ARM_FEATURE_DOTPROD) - -#define SAD64XN(n) \ - uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x4_t abs = \ - sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x4(abs); \ - } \ - \ - uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t abs = \ - sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x4(abs); \ + +#define SAD_WXH_AVG_NEON(w, h) \ + uint32_t vpx_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ } -SAD64XN(32) -SAD64XN(64) +SAD_WXH_AVG_NEON(4, 4) +SAD_WXH_AVG_NEON(4, 8) + +SAD_WXH_AVG_NEON(8, 4) +SAD_WXH_AVG_NEON(8, 8) +SAD_WXH_AVG_NEON(8, 16) + +SAD_WXH_AVG_NEON(16, 8) +SAD_WXH_AVG_NEON(16, 16) +SAD_WXH_AVG_NEON(16, 32) + +SAD_WXH_AVG_NEON(32, 16) +SAD_WXH_AVG_NEON(32, 32) +SAD_WXH_AVG_NEON(32, 64) + +SAD_WXH_AVG_NEON(64, 32) +SAD_WXH_AVG_NEON(64, 64) + +#undef SAD_WXH_AVG_NEON diff --git a/vpx_dsp/arm/sad_neon_dotprod.c b/vpx_dsp/arm/sad_neon_dotprod.c new file mode 100644 index 000000000..fbc0b8d75 --- /dev/null +++ b/vpx_dsp/arm/sad_neon_dotprod.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2021 The 
WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); +} + +static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); +} + +static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +#define SAD_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_sad##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \ + } + +SAD_WXH_NEON_DOTPROD(16, 8) +SAD_WXH_NEON_DOTPROD(16, 16) +SAD_WXH_NEON_DOTPROD(16, 32) + +SAD_WXH_NEON_DOTPROD(32, 16) +SAD_WXH_NEON_DOTPROD(32, 32) +SAD_WXH_NEON_DOTPROD(32, 64) + +SAD_WXH_NEON_DOTPROD(64, 32) +SAD_WXH_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_NEON_DOTPROD + +#define SAD_SKIP_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_sad_skip_##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \ + 2 * ref_stride, (h) / 2); \ + } + +SAD_SKIP_WXH_NEON_DOTPROD(16, 8) +SAD_SKIP_WXH_NEON_DOTPROD(16, 16) 
+SAD_SKIP_WXH_NEON_DOTPROD(16, 32) + +SAD_SKIP_WXH_NEON_DOTPROD(32, 16) +SAD_SKIP_WXH_NEON_DOTPROD(32, 32) +SAD_SKIP_WXH_NEON_DOTPROD(32, 64) + +SAD_SKIP_WXH_NEON_DOTPROD(64, 32) +SAD_SKIP_WXH_NEON_DOTPROD(64, 64) + +#undef SAD_SKIP_WXH_NEON_DOTPROD + +static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + second_pred += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int sad64xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, + h, second_pred); +} + +static INLINE unsigned int sad32xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, + h, second_pred); +} + +static INLINE unsigned int sad16xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + p1 = vld1q_u8(second_pred); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +#define SAD_WXH_AVG_NEON_DOTPROD(w, h) \ + uint32_t vpx_sad##w##x##h##_avg_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ + } + +SAD_WXH_AVG_NEON_DOTPROD(16, 8) +SAD_WXH_AVG_NEON_DOTPROD(16, 16) +SAD_WXH_AVG_NEON_DOTPROD(16, 32) + +SAD_WXH_AVG_NEON_DOTPROD(32, 16) +SAD_WXH_AVG_NEON_DOTPROD(32, 32) +SAD_WXH_AVG_NEON_DOTPROD(32, 64) + +SAD_WXH_AVG_NEON_DOTPROD(64, 32) +SAD_WXH_AVG_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_AVG_NEON_DOTPROD diff --git a/vpx_dsp/arm/sse_neon.c b/vpx_dsp/arm/sse_neon.c new file mode 
100644 index 000000000..f686dc350 --- /dev/null +++ b/vpx_dsp/arm/sse_neon.c @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + uint8x16_t abs_diff = vabdq_u8(s, r); + uint8x8_t abs_diff_lo = vget_low_u8(abs_diff); + uint8x8_t abs_diff_hi = vget_high_u8(abs_diff); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_lo, abs_diff_lo)); + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi)); +} + +static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x8_t s = vld1_u8(src); + uint8x8_t r = vld1_u8(ref); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); +} + +static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32x4_t *sse) { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); +} + +static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int width, int height) { + uint32x4_t sse = vdupq_n_u32(0); + + if ((width & 0x07) && ((width & 0x07) < 5)) { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon(src + j, ref + j, &sse); + sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse); + j += 8; + } while (j + 4 < width); + + sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse); + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } else { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon(src + j, ref + j, &sse); + j += 8; + } while (j < width); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + } + return horizontal_add_uint32x4(sse); +} + +static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + sse_16x1_neon(src + 32, ref + 32, &sse[0]); + sse_16x1_neon(src + 48, ref + 48, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + 
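sse_16x1_neon leans on two facts: |a - b|^2 == (a - b)^2, so the unsigned absolute difference can be squared in place of a signed difference, and 255 * 255 = 65025 still fits in 16 bits, so vmull_u8 cannot overflow before the widening vpadalq_u16 accumulate. The same step as a standalone sketch (illustrative, standard intrinsics only):

#include <arm_neon.h>
#include <stdint.h>

// Sum of squared errors over one 16-byte row.
static uint32_t sse16(const uint8_t *s, const uint8_t *r) {
  const uint8x16_t abs_diff = vabdq_u8(vld1q_u8(s), vld1q_u8(r));
  const uint8x8_t lo = vget_low_u8(abs_diff);
  const uint8x8_t hi = vget_high_u8(abs_diff);
  uint32x4_t sse = vdupq_n_u32(0);
  // u8 x u8 -> u16 squares, pairwise-added into u32 lanes.
  sse = vpadalq_u16(sse, vmull_u8(lo, lo));
  sse = vpadalq_u16(sse, vmull_u8(hi, hi));
  return vaddvq_u32(sse);  // AArch64; Armv7 builds need the pairwise fallback
}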
uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_16x1_neon(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse = vdupq_n_u32(0); + + int i = height; + do { + sse_8x1_neon(src, ref, &sse); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(sse); +} + +static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse = vdupq_n_u32(0); + + int i = height; + do { + sse_4x2_neon(src, src_stride, ref, ref_stride, &sse); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(sse); +} + +int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, int width, int height) { + switch (width) { + case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height); + case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height); + case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height); + case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height); + case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height); + default: + return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); + } +} diff --git a/vpx_dsp/arm/sse_neon_dotprod.c b/vpx_dsp/arm/sse_neon_dotprod.c new file mode 100644 index 000000000..877777391 --- /dev/null +++ b/vpx_dsp/arm/sse_neon_dotprod.c @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + *sse = vdotq_u32(*sse, abs_diff, abs_diff); +} + +static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, + uint32x2_t *sse) { + uint8x8_t s = vld1_u8(src); + uint8x8_t r = vld1_u8(ref); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vdot_u32(*sse, abs_diff, abs_diff); +} + +static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32x2_t *sse) { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vdot_u32(*sse, abs_diff, abs_diff); +} + +static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int width, int height) { + uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + if ((width & 0x07) && ((width & 0x07) < 5)) { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); + sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, + &sse[1]); + j += 8; + } while (j + 4 < width); + + sse_4x2_neon_dotprod(src + j, src_stride, ref + j, ref_stride, &sse[0]); + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } else { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); + sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, + &sse[1]); + j += 8; + } while (j < width); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } + return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]); + sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_16x1_neon_dotprod(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int 
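The dotprod file replaces that multiply/accumulate pair with a self dot product: UDOTing the absolute-difference vector with itself produces four 32-bit sums of squared differences in one instruction. A sketch, again assuming a +dotprod target:

#include <arm_neon.h>
#include <stdint.h>

static uint32_t sse16_dotprod(const uint8_t *s, const uint8_t *r) {
  const uint8x16_t d = vabdq_u8(vld1q_u8(s), vld1q_u8(r));
  // d . d: each u32 lane gains at most 4 * 255^2 (~2^18) per step, so a
  // lane has headroom for far more rows than a 64x64 block ever needs.
  return vaddvq_u32(vdotq_u32(vdupq_n_u32(0), d, d));
}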
src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + int i = height; + do { + sse_8x1_neon_dotprod(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_8x1_neon_dotprod(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x2_t sse = vdup_n_u32(0); + + int i = height; + do { + sse_4x2_neon_dotprod(src, src_stride, ref, ref_stride, &sse); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x2(sse); +} + +int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int width, + int height) { + switch (width) { + case 4: + return sse_4xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 8: + return sse_8xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 16: + return sse_16xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 32: + return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 64: + return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + default: + return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width, + height); + } +} diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index 9328c3ed8..d92f1615d 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -143,59 +143,58 @@ static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr, return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } -#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ - unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, unsigned int *sse) { \ - if (xoffset == 0) { \ - if (yoffset == 0) { \ - return vpx_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \ - sse); \ - } else if (yoffset == 4) { \ - uint8_t tmp[w * h]; \ - var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ - } else { \ - uint8_t tmp[w * h]; \ - var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ - yoffset); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ - } \ - } else if (xoffset == 4) { \ - uint8_t tmp0[w * (h + padding)]; \ - if (yoffset == 0) { \ - var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ - return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ - } else if (yoffset == 4) { \ - uint8_t tmp1[w * (h + padding)]; \ - var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ - var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } else { \ - uint8_t tmp1[w * (h + padding)]; \ - var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ - var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } \ - } else { \ - uint8_t tmp0[w * (h + padding)]; \ - if (yoffset == 0) { \ - var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ - return vpx_variance##w##x##h##_neon(tmp0, 
w, ref, ref_stride, sse); \ - } else if (yoffset == 4) { \ - uint8_t tmp1[w * h]; \ - var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ - xoffset); \ - var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } else { \ - uint8_t tmp1[w * h]; \ - var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ - xoffset); \ - var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } \ - } \ +#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ + yoffset); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ } // 4x<h> blocks are processed two rows at a time, so require an extra row of @@ -418,53 +417,53 @@ static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, uint8_t tmp[w * h]; \ if (yoffset == 0) { \ avg_pred(src, tmp, source_stride, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \ source_stride, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } else { \ avg_pred_var_filter_block2d_bil_w##w( \ src, tmp, source_stride, source_stride, h, yoffset, 
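The reworked subpel-variance macro (and the compound-prediction version that follows) is a case analysis on the eighth-pel offsets: offset 0 degenerates to a copy, offset 4 to a rounding average, and only the remaining offsets need the real bilinear filter, so each case can pick the cheapest pre-filter before calling the plain variance kernel. A scalar restatement, assuming the standard vpx_dsp bilinear tap table (taps sum to 128, FILTER_BITS == 7):

#include <stdint.h>

// Eighth-pel bilinear tap pairs; each pair sums to 1 << 7.
static const uint8_t kBilinearTaps[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 }
};

static uint8_t bilinear(uint8_t a, uint8_t b, int offset) {
  return (uint8_t)((a * kBilinearTaps[offset][0] +
                    b * kBilinearTaps[offset][1] + 64) >> 7);
}
// offset 0: taps {128, 0}  -> returns a unchanged            (copy path)
// offset 4: taps {64, 64}  -> ((a + b) * 64 + 64) >> 7
//                           == (a + b + 1) >> 1              (vrhaddq path)

The macro bodies also drop the explicit _neon suffix on the final variance calls, letting the vpx_variance##w##x##h RTCD hook resolve at runtime, which appears to be what makes the new _neon_dotprod variance kernels reachable from here.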
second_pred); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ uint8_t tmp0[w * (h + padding)]; \ if (yoffset == 0) { \ avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \ second_pred); \ - return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * (h + padding)]; \ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp1[w * (h + padding)]; \ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } \ } else { \ uint8_t tmp0[w * (h + padding)]; \ if (yoffset == 0) { \ avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \ xoffset, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ (h + padding), xoffset); \ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ (h + padding), xoffset); \ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } \ } \ } diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 9a7c424e8..11821dc10 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -16,8 +16,51 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) { +#if VPX_ARCH_AARCH64 + return vaddlv_u8(a); +#else + const uint16x4_t b = vpaddl_u8(a); + const uint16x4_t c = vpadd_u16(b, b); + return vget_lane_u16(c, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) { +#if VPX_ARCH_AARCH64 + return vaddlv_u8(a); +#else + const uint16x4_t b = vpaddl_u8(a); + const uint16x4_t c = vpadd_u16(b, b); + const uint16x4_t d = vpadd_u16(c, c); + return vget_lane_u16(d, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) { +#if VPX_ARCH_AARCH64 + return vaddlvq_u8(a); +#else + const uint16x8_t b = vpaddlq_u8(a); + const uint16x4_t c = vadd_u16(vget_low_u16(b), vget_high_u16(b)); + const uint16x4_t d = vpadd_u16(c, c); + const uint16x4_t e = vpadd_u16(d, d); + return vget_lane_u16(e, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) { +#if VPX_ARCH_AARCH64 + return vaddv_u16(a); +#else + const uint16x4_t b = vpadd_u16(a, a); + const uint16x4_t c = vpadd_u16(b, b); + return vget_lane_u16(c, 0); +#endif +} + static INLINE int32_t horizontal_add_int16x8(const 
int16x8_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_s16(a); #else const int32x4_t b = vpaddlq_s16(a); @@ -29,7 +72,7 @@ static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) { } static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_u16(a); #else const uint32x4_t b = vpaddlq_u16(a); @@ -40,8 +83,67 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { #endif } +static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) { +#if VPX_ARCH_AARCH64 + const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); + const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); + const uint16x8_t b0 = vpaddq_u16(a0, a1); + return vpaddlq_u16(b0); +#else + const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint16x4_t b0 = vpadd_u16(a0, a1); + const uint16x4_t b1 = vpadd_u16(a2, a3); + return vpaddlq_u16(vcombine_u16(b0, b1)); +#endif +} + +static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { +#if VPX_ARCH_AARCH64 + return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi); +#else + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif +} + +static INLINE uint32x4_t horizontal_long_add_4d_uint16x8( + const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) { + const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]); + const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]); + const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]); + const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]); + const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]); + const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]); + const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]); + const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]); +#if VPX_ARCH_AARCH64 + const uint32x4_t c0 = vpaddq_u32(b0, b1); + const uint32x4_t c1 = vpaddq_u32(b2, b3); + return vpaddq_u32(c0, c1); +#else + const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); + const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); + const uint32x2_t d0 = vpadd_u32(c0, c1); + const uint32x2_t d1 = vpadd_u32(c2, c3); + return vcombine_u32(d0, d1); +#endif +} + static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddv_s32(a); #else return vget_lane_s32(a, 0) + vget_lane_s32(a, 1); @@ -49,15 +151,16 @@ static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { } static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddv_u32(a); #else - return vget_lane_u32(a, 0) + vget_lane_u32(a, 1); + const uint64x1_t b = vpaddl_u32(a); + return vget_lane_u32(vreinterpret_u32_u64(b), 0); #endif } static INLINE int32_t horizontal_add_int32x4(const 
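Every helper added to sum_neon.h follows one template: a single across-lanes instruction on AArch64, and a log2(lanes) chain of pairwise adds everywhere else. The library keys this on its configure-time VPX_ARCH_AARCH64 macro; the sketch below uses the compiler's __aarch64__ only to stay self-contained:

#include <arm_neon.h>
#include <stdint.h>

static uint32_t hadd_u16x8(const uint16x8_t a) {
#if defined(__aarch64__)
  return vaddlvq_u16(a);  // one widening add across all 8 lanes
#else
  // Widen u16x8 -> u32x4 -> u64x2, then add the two remaining lanes.
  const uint32x4_t b = vpaddlq_u16(a);
  const uint64x2_t c = vpaddlq_u32(b);
  return (uint32_t)(vgetq_lane_u64(c, 0) + vgetq_lane_u64(c, 1));
#endif
}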
int32x4_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddvq_s32(a); #else const int64x2_t b = vpaddlq_s32(a); @@ -68,7 +171,7 @@ static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) { } static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddvq_u32(a); #else const uint64x2_t b = vpaddlq_u32(a); @@ -77,4 +180,96 @@ static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { return vget_lane_u32(c, 0); #endif } + +static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) { +#if VPX_ARCH_AARCH64 + uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]); + uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]); + return vpaddq_u32(res01, res23); +#else + uint32x4_t res = vdupq_n_u32(0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3); + return res; +#endif +} + +static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) { +#if VPX_ARCH_AARCH64 + return vaddlvq_u32(a); +#else + const uint64x2_t b = vpaddlq_u32(a); + return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1); +#endif +} + +static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) { +#if VPX_ARCH_AARCH64 + return vaddvq_s64(a); +#else + return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); +#endif +} + +static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) { +#if VPX_ARCH_AARCH64 + return vaddvq_u64(a); +#else + return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); +#endif +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x2(const uint32x4_t a[2]) { + return horizontal_long_add_uint32x4(a[0]) + + horizontal_long_add_uint32x4(a[1]); +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x4(const uint32x4_t a[4]) { + uint64x2_t sum = vpaddlq_u32(a[0]); + sum = vpadalq_u32(sum, a[1]); + sum = vpadalq_u32(sum, a[2]); + sum = vpadalq_u32(sum, a[3]); + + return horizontal_add_uint64x2(sum); +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x8(const uint32x4_t a[8]) { + uint64x2_t sum[2]; + sum[0] = vpaddlq_u32(a[0]); + sum[1] = vpaddlq_u32(a[1]); + sum[0] = vpadalq_u32(sum[0], a[2]); + sum[1] = vpadalq_u32(sum[1], a[3]); + sum[0] = vpadalq_u32(sum[0], a[4]); + sum[1] = vpadalq_u32(sum[1], a[5]); + sum[0] = vpadalq_u32(sum[0], a[6]); + sum[1] = vpadalq_u32(sum[1], a[7]); + + return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1])); +} + +static INLINE uint64_t +horizontal_long_add_uint32x4_x16(const uint32x4_t a[16]) { + uint64x2_t sum[2]; + sum[0] = vpaddlq_u32(a[0]); + sum[1] = vpaddlq_u32(a[1]); + sum[0] = vpadalq_u32(sum[0], a[2]); + sum[1] = vpadalq_u32(sum[1], a[3]); + sum[0] = vpadalq_u32(sum[0], a[4]); + sum[1] = vpadalq_u32(sum[1], a[5]); + sum[0] = vpadalq_u32(sum[0], a[6]); + sum[1] = vpadalq_u32(sum[1], a[7]); + sum[0] = vpadalq_u32(sum[0], a[8]); + sum[1] = vpadalq_u32(sum[1], a[9]); + sum[0] = vpadalq_u32(sum[0], a[10]); + sum[1] = vpadalq_u32(sum[1], a[11]); + sum[0] = vpadalq_u32(sum[0], a[12]); + sum[1] = vpadalq_u32(sum[1], a[13]); + sum[0] = vpadalq_u32(sum[0], a[14]); + sum[1] = vpadalq_u32(sum[1], a[15]); + + return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1])); +} + #endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ diff --git a/vpx_dsp/arm/sum_squares_neon.c b/vpx_dsp/arm/sum_squares_neon.c index cfefad993..074afe325 100644 --- a/vpx_dsp/arm/sum_squares_neon.c +++ 
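The horizontal_long_add_uint32x4_xN family exists to retire many u32 accumulators into a u64 total before any lane can wrap: vpaddlq_u32 widens the first accumulator and vpadalq_u32 folds each further one into the running u64x2 pair. Reduced to the 4-accumulator case (a restatement of the pattern above, not new behavior):

#include <arm_neon.h>
#include <stdint.h>

static uint64_t long_add_u32x4_x4(const uint32x4_t a[4]) {
  uint64x2_t sum = vpaddlq_u32(a[0]);  // 4 x u32 -> 2 x u64
  sum = vpadalq_u32(sum, a[1]);        // accumulate in 64 bits from here on
  sum = vpadalq_u32(sum, a[2]);
  sum = vpadalq_u32(sum, a[3]);
  return vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
}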
b/vpx_dsp/arm/sum_squares_neon.c @@ -9,77 +9,92 @@ */ #include <arm_neon.h> - #include <assert.h> + #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/sum_neon.h" uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size) { - uint64x1_t s2; - if (size == 4) { int16x4_t s[4]; - int32x4_t s0; - uint32x2_t s1; + int32x4_t sum_s32; s[0] = vld1_s16(src + 0 * stride); s[1] = vld1_s16(src + 1 * stride); s[2] = vld1_s16(src + 2 * stride); s[3] = vld1_s16(src + 3 * stride); - s0 = vmull_s16(s[0], s[0]); - s0 = vmlal_s16(s0, s[1], s[1]); - s0 = vmlal_s16(s0, s[2], s[2]); - s0 = vmlal_s16(s0, s[3], s[3]); - s1 = vpadd_u32(vget_low_u32(vreinterpretq_u32_s32(s0)), - vget_high_u32(vreinterpretq_u32_s32(s0))); - s2 = vpaddl_u32(s1); + + sum_s32 = vmull_s16(s[0], s[0]); + sum_s32 = vmlal_s16(sum_s32, s[1], s[1]); + sum_s32 = vmlal_s16(sum_s32, s[2], s[2]); + sum_s32 = vmlal_s16(sum_s32, s[3], s[3]); + + return horizontal_long_add_uint32x4(vreinterpretq_u32_s32(sum_s32)); } else { - int r = size; - uint64x2_t s1 = vdupq_n_u64(0); + uint64x2_t sum_u64 = vdupq_n_u64(0); + int rows = size; do { - int c = size; - int32x4_t s0 = vdupq_n_s32(0); - const int16_t *src_t = src; + const int16_t *src_ptr = src; + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int cols = size; do { int16x8_t s[8]; - s[0] = vld1q_s16(src_t + 0 * stride); - s[1] = vld1q_s16(src_t + 1 * stride); - s[2] = vld1q_s16(src_t + 2 * stride); - s[3] = vld1q_s16(src_t + 3 * stride); - s[4] = vld1q_s16(src_t + 4 * stride); - s[5] = vld1q_s16(src_t + 5 * stride); - s[6] = vld1q_s16(src_t + 6 * stride); - s[7] = vld1q_s16(src_t + 7 * stride); - s0 = vmlal_s16(s0, vget_low_s16(s[0]), vget_low_s16(s[0])); - s0 = vmlal_s16(s0, vget_low_s16(s[1]), vget_low_s16(s[1])); - s0 = vmlal_s16(s0, vget_low_s16(s[2]), vget_low_s16(s[2])); - s0 = vmlal_s16(s0, vget_low_s16(s[3]), vget_low_s16(s[3])); - s0 = vmlal_s16(s0, vget_low_s16(s[4]), vget_low_s16(s[4])); - s0 = vmlal_s16(s0, vget_low_s16(s[5]), vget_low_s16(s[5])); - s0 = vmlal_s16(s0, vget_low_s16(s[6]), vget_low_s16(s[6])); - s0 = vmlal_s16(s0, vget_low_s16(s[7]), vget_low_s16(s[7])); - s0 = vmlal_s16(s0, vget_high_s16(s[0]), vget_high_s16(s[0])); - s0 = vmlal_s16(s0, vget_high_s16(s[1]), vget_high_s16(s[1])); - s0 = vmlal_s16(s0, vget_high_s16(s[2]), vget_high_s16(s[2])); - s0 = vmlal_s16(s0, vget_high_s16(s[3]), vget_high_s16(s[3])); - s0 = vmlal_s16(s0, vget_high_s16(s[4]), vget_high_s16(s[4])); - s0 = vmlal_s16(s0, vget_high_s16(s[5]), vget_high_s16(s[5])); - s0 = vmlal_s16(s0, vget_high_s16(s[6]), vget_high_s16(s[6])); - s0 = vmlal_s16(s0, vget_high_s16(s[7]), vget_high_s16(s[7])); - src_t += 8; - c -= 8; - } while (c); + s[0] = vld1q_s16(src_ptr + 0 * stride); + s[1] = vld1q_s16(src_ptr + 1 * stride); + s[2] = vld1q_s16(src_ptr + 2 * stride); + s[3] = vld1q_s16(src_ptr + 3 * stride); + s[4] = vld1q_s16(src_ptr + 4 * stride); + s[5] = vld1q_s16(src_ptr + 5 * stride); + s[6] = vld1q_s16(src_ptr + 6 * stride); + s[7] = vld1q_s16(src_ptr + 7 * stride); - s1 = vaddw_u32(s1, vget_low_u32(vreinterpretq_u32_s32(s0))); - s1 = vaddw_u32(s1, vget_high_u32(vreinterpretq_u32_s32(s0))); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[0]), vget_low_s16(s[0])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[1]), vget_low_s16(s[1])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[2]), vget_low_s16(s[2])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[3]), vget_low_s16(s[3])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[4]), vget_low_s16(s[4])); 
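The contract the sum_squares rewrite must honor is just the scalar double loop; the NEON version restructures it into 8x8 strips with two s32 accumulators that are flushed into a u64x2 total after every strip, bounding how much can pile up in the 32-bit lanes. The reference, restated (the upstream C version lives in vpx_dsp/sum_squares.c):

#include <stdint.h>

static uint64_t sum_squares_2d_i16_ref(const int16_t *src, int stride,
                                       int size) {
  uint64_t ss = 0;
  for (int r = 0; r < size; ++r) {
    for (int c = 0; c < size; ++c) {
      const int64_t v = src[r * stride + c];  // widen before squaring
      ss += (uint64_t)(v * v);
    }
  }
  return ss;
}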
+ sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[5]), vget_low_s16(s[5])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[6]), vget_low_s16(s[6])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[7]), vget_low_s16(s[7])); + + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[0]), vget_high_s16(s[0])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[1]), vget_high_s16(s[1])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[2]), vget_high_s16(s[2])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[3]), vget_high_s16(s[3])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[4]), vget_high_s16(s[4])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[5]), vget_high_s16(s[5])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[6]), vget_high_s16(s[6])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[7]), vget_high_s16(s[7])); + + src_ptr += 8; + cols -= 8; + } while (cols); + + sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[0])); + sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[1])); src += 8 * stride; - r -= 8; - } while (r); + rows -= 8; + } while (rows); - s2 = vadd_u64(vget_low_u64(s1), vget_high_u64(s1)); + return horizontal_add_uint64x2(sum_u64); } - - return vget_lane_u64(s2, 0); } diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 41d44f2b1..74f85a6bb 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -23,44 +23,77 @@ // b0.val[1]: 04 05 06 07 20 21 22 23 static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { int16x8x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_s16_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s16_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1))); b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), vreinterpret_s16_s32(vget_high_s32(a1))); +#endif return b0; } static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { int32x4x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_s32_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s32_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1)); b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1)); +#endif return b0; } static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { int64x2x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); + b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); +#else b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)), vreinterpret_s64_s32(vget_low_s32(a1))); b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)), vreinterpret_s64_s32(vget_high_s32(a1))); +#endif return b0; } static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { uint8x16x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_u8_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u8_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)), vreinterpret_u8_u32(vget_low_u32(a1))); b0.val[1] = 
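On AArch64 the 64-bit-lane interleaves in transpose_neon.h become single TRN1/TRN2 instructions instead of vget/vcombine pairs. The identity, as a sketch:

#include <arm_neon.h>

// vtrn1q_s64(a, b) = { a[0], b[0] } and vtrn2q_s64(a, b) = { a[1], b[1] }
// (AArch64-only intrinsics) -- exactly the low/low and high/high
// recombination the portable fallback spells out by hand.
static void trn_s64(int64x2_t a, int64x2_t b, int64x2_t *lo, int64x2_t *hi) {
#if defined(__aarch64__)
  *lo = vtrn1q_s64(a, b);
  *hi = vtrn2q_s64(a, b);
#else
  *lo = vcombine_s64(vget_low_s64(a), vget_low_s64(b));
  *hi = vcombine_s64(vget_high_s64(a), vget_high_s64(b));
#endif
}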
vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)), vreinterpret_u8_u32(vget_high_u32(a1))); +#endif return b0; } static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { uint16x8x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u16_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), vreinterpret_u16_u32(vget_low_u32(a1))); b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)), vreinterpret_u16_u32(vget_high_u32(a1))); +#endif return b0; } @@ -141,17 +174,13 @@ static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) { // c0: 00 01 20 21 02 03 22 23 // c1: 10 11 30 31 12 13 32 33 - const int32x4_t c0 = - vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1])); - const int32x4_t c1 = - vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1])); + const int16x8x2_t c0 = vpx_vtrnq_s64_to_s16(b0.val[0], b0.val[1]); // Swap 16 bit elements resulting in: // d0.val[0]: 00 10 20 30 02 12 22 32 // d0.val[1]: 01 11 21 31 03 13 23 33 - const int16x8x2_t d0 = - vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1)); + const int16x8x2_t d0 = vtrnq_s16(c0.val[0], c0.val[1]); *a0 = d0.val[0]; *a1 = d0.val[1]; @@ -172,17 +201,13 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) { // c0: 00 01 20 21 02 03 22 23 // c1: 10 11 30 31 12 13 32 33 - const uint32x4_t c0 = - vcombine_u32(vget_low_u32(b0.val[0]), vget_low_u32(b0.val[1])); - const uint32x4_t c1 = - vcombine_u32(vget_high_u32(b0.val[0]), vget_high_u32(b0.val[1])); + const uint16x8x2_t c0 = vpx_vtrnq_u64_to_u16(b0.val[0], b0.val[1]); // Swap 16 bit elements resulting in: // d0.val[0]: 00 10 20 30 02 12 22 32 // d0.val[1]: 01 11 21 31 03 13 23 33 - const uint16x8x2_t d0 = - vtrnq_u16(vreinterpretq_u16_u32(c0), vreinterpretq_u16_u32(c1)); + const uint16x8x2_t d0 = vtrnq_u16(c0.val[0], c0.val[1]); *a0 = d0.val[0]; *a1 = d0.val[1]; @@ -281,7 +306,7 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, const int16x4_t a6, const int16x4_t a7, int16x8_t *const o0, int16x8_t *const o1, int16x8_t *const o2, int16x8_t *const o3) { - // Swap 16 bit elements. Goes from: + // Combine rows. 
Goes from: // a0: 00 01 02 03 // a1: 10 11 12 13 // a2: 20 21 22 23 @@ -291,53 +316,40 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, // a6: 60 61 62 63 // a7: 70 71 72 73 // to: - // b0.val[0]: 00 10 02 12 - // b0.val[1]: 01 11 03 13 - // b1.val[0]: 20 30 22 32 - // b1.val[1]: 21 31 23 33 - // b2.val[0]: 40 50 42 52 - // b2.val[1]: 41 51 43 53 - // b3.val[0]: 60 70 62 72 - // b3.val[1]: 61 71 63 73 + // b0: 00 01 02 03 40 41 42 43 + // b1: 10 11 12 13 50 51 52 53 + // b2: 20 21 22 23 60 61 62 63 + // b3: 30 31 32 33 70 71 72 73 + + const int16x8_t b0 = vcombine_s16(a0, a4); + const int16x8_t b1 = vcombine_s16(a1, a5); + const int16x8_t b2 = vcombine_s16(a2, a6); + const int16x8_t b3 = vcombine_s16(a3, a7); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 02 12 40 50 42 52 + // c0.val[1]: 01 11 03 13 41 51 43 53 + // c1.val[0]: 20 30 22 32 60 70 62 72 + // c1.val[1]: 21 31 23 33 61 71 63 73 - const int16x4x2_t b0 = vtrn_s16(a0, a1); - const int16x4x2_t b1 = vtrn_s16(a2, a3); - const int16x4x2_t b2 = vtrn_s16(a4, a5); - const int16x4x2_t b3 = vtrn_s16(a6, a7); + const int16x8x2_t c0 = vtrnq_s16(b0, b1); + const int16x8x2_t c1 = vtrnq_s16(b2, b3); // Swap 32 bit elements resulting in: - // c0.val[0]: 00 10 20 30 - // c0.val[1]: 02 12 22 32 - // c1.val[0]: 01 11 21 31 - // c1.val[1]: 03 13 23 33 - // c2.val[0]: 40 50 60 70 - // c2.val[1]: 42 52 62 72 - // c3.val[0]: 41 51 61 71 - // c3.val[1]: 43 53 63 73 + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 02 12 22 32 42 52 62 72 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 03 13 23 33 43 53 63 73 - const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), - vreinterpret_s32_s16(b1.val[0])); - const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), - vreinterpret_s32_s16(b1.val[1])); - const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]), - vreinterpret_s32_s16(b3.val[0])); - const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]), - vreinterpret_s32_s16(b3.val[1])); + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); - // Swap 64 bit elements resulting in: - // o0: 00 10 20 30 40 50 60 70 - // o1: 01 11 21 31 41 51 61 71 - // o2: 02 12 22 32 42 52 62 72 - // o3: 03 13 23 33 43 53 63 73 - - *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]), - vreinterpret_s16_s32(c2.val[0])); - *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]), - vreinterpret_s16_s32(c3.val[0])); - *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]), - vreinterpret_s16_s32(c2.val[1])); - *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]), - vreinterpret_s16_s32(c3.val[1])); + *o0 = vreinterpretq_s16_s32(d0.val[0]); + *o1 = vreinterpretq_s16_s32(d1.val[0]); + *o2 = vreinterpretq_s16_s32(d0.val[1]); + *o3 = vreinterpretq_s16_s32(d1.val[1]); } static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1, @@ -569,37 +581,73 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, } // Transpose 8x8 to a new location. -static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); - - // Swap 32 bit elements. 
- const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); - - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); - - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; +static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + + out[0] = d0.val[0]; + out[1] = d1.val[0]; + out[2] = d2.val[0]; + out[3] = d3.val[0]; + out[4] = d0.val[1]; + out[5] = d1.val[1]; + out[6] = d2.val[1]; + out[7] = d3.val[1]; } static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, @@ -658,6 +706,7 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, // 
d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); @@ -729,6 +778,7 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 + const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]); const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]); const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]); @@ -866,6 +916,68 @@ static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/, out_right[7] = out[7].val[1]; } +static INLINE void transpose_s32_16x16(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + int32x4_t tl[16], tr[16]; + + // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3. + tl[0] = left1[8]; + tl[1] = left1[9]; + tl[2] = left1[10]; + tl[3] = left1[11]; + tl[4] = left1[12]; + tl[5] = left1[13]; + tl[6] = left1[14]; + tl[7] = left1[15]; + tr[0] = right1[8]; + tr[1] = right1[9]; + tr[2] = right1[10]; + tr[3] = right1[11]; + tr[4] = right1[12]; + tr[5] = right1[13]; + tr[6] = right1[14]; + tr[7] = right1[15]; + + left1[8] = left2[0]; + left1[9] = left2[1]; + left1[10] = left2[2]; + left1[11] = left2[3]; + left1[12] = left2[4]; + left1[13] = left2[5]; + left1[14] = left2[6]; + left1[15] = left2[7]; + right1[8] = right2[0]; + right1[9] = right2[1]; + right1[10] = right2[2]; + right1[11] = right2[3]; + right1[12] = right2[4]; + right1[13] = right2[5]; + right1[14] = right2[6]; + right1[15] = right2[7]; + + left2[0] = tl[0]; + left2[1] = tl[1]; + left2[2] = tl[2]; + left2[3] = tl[3]; + left2[4] = tl[4]; + left2[5] = tl[5]; + left2[6] = tl[6]; + left2[7] = tl[7]; + right2[0] = tr[0]; + right2[1] = tr[1]; + right2[2] = tr[2]; + right2[3] = tr[3]; + right2[4] = tr[4]; + right2[5] = tr[5]; + right2[6] = tr[6]; + right2[7] = tr[7]; + + transpose_s32_8x8_2(left1, right1, left1, right1); + transpose_s32_8x8_2(left2, right2, left2, right2); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left2 + 8, right2 + 8); +} + static INLINE void transpose_u8_16x8( const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2, const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5, diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 3ccc4e807..efb2c1d8d 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -19,143 +19,6 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -#if defined(__ARM_FEATURE_DOTPROD) - -// Process a block of width 4 four rows at a time. 
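transpose_s32_16x16 reduces the 16x16 case to existing 8x8 machinery via the block-matrix identity (a restatement, not new code):

//   [ A  B ]^T   [ A^T  C^T ]
//   [ C  D ]   = [ B^T  D^T ]
//
// i.e. swap the off-diagonal 8x8 quadrants, then transpose each of the
// four quadrants in place -- the copy loops are the swap of B and C, and
// the four transpose_s32_8x8_2 calls are the per-quadrant step.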
-static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h, uint32_t *sse, int *sum) { - uint32x4_t src_sum = vdupq_n_u32(0); - uint32x4_t ref_sum = vdupq_n_u32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); - - const uint8x16_t abs_diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - - src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); - ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - i -= 4; - } while (i != 0); - - *sum = horizontal_add_int32x4( - vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); - *sse = horizontal_add_uint32x4(sse_u32); -} - -// Process a block of width 8 two rows at a time. -static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h, uint32_t *sse, int *sum) { - uint32x4_t src_sum = vdupq_n_u32(0); - uint32x4_t ref_sum = vdupq_n_u32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - const uint8x16_t s = - vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride)); - const uint8x16_t r = - vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride)); - - const uint8x16_t abs_diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - - src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); - ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - - src_ptr += 2 * src_stride; - ref_ptr += 2 * ref_stride; - i -= 2; - } while (i != 0); - - *sum = horizontal_add_int32x4( - vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); - *sse = horizontal_add_uint32x4(sse_u32); -} - -// Process a block of width 16 one row at a time. -static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h, uint32_t *sse, int *sum) { - uint32x4_t src_sum = vdupq_n_u32(0); - uint32x4_t ref_sum = vdupq_n_u32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - const uint8x16_t s = vld1q_u8(src_ptr); - const uint8x16_t r = vld1q_u8(ref_ptr); - - const uint8x16_t abs_diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - - src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); - ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } while (--i != 0); - - *sum = horizontal_add_int32x4( - vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); - *sse = horizontal_add_uint32x4(sse_u32); -} - -// Process a block of any size where the width is divisible by 16. 
-static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int w, int h, uint32_t *sse, int *sum) { - uint32x4_t src_sum = vdupq_n_u32(0); - uint32x4_t ref_sum = vdupq_n_u32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - int j = 0; - do { - const uint8x16_t s = vld1q_u8(src_ptr + j); - const uint8x16_t r = vld1q_u8(ref_ptr + j); - - const uint8x16_t abs_diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - - src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); - ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - - j += 16; - } while (j < w); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } while (--i != 0); - - *sum = horizontal_add_int32x4( - vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); - *sse = horizontal_add_uint32x4(sse_u32); -} - -static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, int h, - uint32_t *sse, int *sum) { - variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); -} - -static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, int h, - uint32_t *sse, int *sum) { - variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); -} - -#else // !defined(__ARM_FEATURE_DOTPROD) - // Process a block of width 4 two rows at a time. static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, @@ -328,8 +191,6 @@ static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum); } -#endif // defined(__ARM_FEATURE_DOTPROD) - void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { @@ -369,165 +230,103 @@ VARIANCE_WXH_NEON(32, 64, 11) VARIANCE_WXH_NEON(64, 32, 11) VARIANCE_WXH_NEON(64, 64, 12) -#if defined(__ARM_FEATURE_DOTPROD) +#undef VARIANCE_WXH_NEON + +static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; -unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int *sse) { - int i; - uint8x16_t a[2], b[2], abs_diff[2]; - uint32x4_t sse_vec[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + int i = h / 2; + do { + uint8x8_t s0, s1, r0, r1, diff0, diff1; + uint16x8_t sse0, sse1; - for (i = 0; i < 8; i++) { - a[0] = vld1q_u8(src_ptr); + s0 = vld1_u8(src_ptr); src_ptr += src_stride; - a[1] = vld1q_u8(src_ptr); + s1 = vld1_u8(src_ptr); src_ptr += src_stride; - b[0] = vld1q_u8(ref_ptr); + r0 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - b[1] = vld1q_u8(ref_ptr); + r1 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - abs_diff[0] = vabdq_u8(a[0], b[0]); - abs_diff[1] = vabdq_u8(a[1], b[1]); + diff0 = vabd_u8(s0, r0); + diff1 = vabd_u8(s1, r1); - sse_vec[0] = vdotq_u32(sse_vec[0], abs_diff[0], abs_diff[0]); - sse_vec[1] = vdotq_u32(sse_vec[1], abs_diff[1], abs_diff[1]); - } + sse0 = vmull_u8(diff0, diff0); + sse_u32[0] = vpadalq_u16(sse_u32[0], sse0); + sse1 = vmull_u8(diff1, diff1); + sse_u32[1] = vpadalq_u16(sse_u32[1], sse1); + } while (--i != 0); - *sse = horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1])); - return horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1])); + return 
horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } -unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, - int ref_stride) { - uint8x8_t a[4], b[4], abs_diff[4]; - uint32x2_t sse = vdup_n_u32(0); - - a[0] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[0] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[1] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[1] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[2] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[2] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[3] = vld1_u8(src_ptr); - b[3] = vld1_u8(ref_ptr); - - abs_diff[0] = vabd_u8(a[0], b[0]); - abs_diff[1] = vabd_u8(a[1], b[1]); - abs_diff[2] = vabd_u8(a[2], b[2]); - abs_diff[3] = vabd_u8(a[3], b[3]); - - sse = vdot_u32(sse, abs_diff[0], abs_diff[0]); - sse = vdot_u32(sse, abs_diff[1], abs_diff[1]); - sse = vdot_u32(sse, abs_diff[2], abs_diff[2]); - sse = vdot_u32(sse, abs_diff[3], abs_diff[3]); - - return vget_lane_u32(sse, 0); -} - -#else // !defined(__ARM_FEATURE_DOTPROD) +static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; -unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int *sse) { - int i; - uint8x16_t a[2], b[2]; - int16x4_t diff_lo[4], diff_hi[4]; - uint16x8_t diff[4]; - int32x4_t sse_vec[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), - vdupq_n_s32(0) }; + int i = h; + do { + uint8x16_t s, r, diff; + uint16x8_t sse0, sse1; - for (i = 0; i < 8; i++) { - a[0] = vld1q_u8(src_ptr); - src_ptr += src_stride; - a[1] = vld1q_u8(src_ptr); + s = vld1q_u8(src_ptr); src_ptr += src_stride; - b[0] = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - b[1] = vld1q_u8(ref_ptr); + r = vld1q_u8(ref_ptr); ref_ptr += ref_stride; - diff[0] = vsubl_u8(vget_low_u8(a[0]), vget_low_u8(b[0])); - diff[1] = vsubl_u8(vget_high_u8(a[0]), vget_high_u8(b[0])); - diff[2] = vsubl_u8(vget_low_u8(a[1]), vget_low_u8(b[1])); - diff[3] = vsubl_u8(vget_high_u8(a[1]), vget_high_u8(b[1])); - - diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0])); - diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1])); - sse_vec[0] = vmlal_s16(sse_vec[0], diff_lo[0], diff_lo[0]); - sse_vec[1] = vmlal_s16(sse_vec[1], diff_lo[1], diff_lo[1]); - - diff_lo[2] = vreinterpret_s16_u16(vget_low_u16(diff[2])); - diff_lo[3] = vreinterpret_s16_u16(vget_low_u16(diff[3])); - sse_vec[2] = vmlal_s16(sse_vec[2], diff_lo[2], diff_lo[2]); - sse_vec[3] = vmlal_s16(sse_vec[3], diff_lo[3], diff_lo[3]); - - diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0])); - diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1])); - sse_vec[0] = vmlal_s16(sse_vec[0], diff_hi[0], diff_hi[0]); - sse_vec[1] = vmlal_s16(sse_vec[1], diff_hi[1], diff_hi[1]); - - diff_hi[2] = vreinterpret_s16_u16(vget_high_u16(diff[2])); - diff_hi[3] = vreinterpret_s16_u16(vget_high_u16(diff[3])); - sse_vec[2] = vmlal_s16(sse_vec[2], diff_hi[2], diff_hi[2]); - sse_vec[3] = vmlal_s16(sse_vec[3], diff_hi[3], diff_hi[3]); - } + diff = vabdq_u8(s, r); - sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[1]); - sse_vec[2] = vaddq_s32(sse_vec[2], sse_vec[3]); - sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[2]); + sse0 = vmull_u8(vget_low_u8(diff), vget_low_u8(diff)); + sse_u32[0] = vpadalq_u16(sse_u32[0], sse0); + sse1 = vmull_u8(vget_high_u8(diff), vget_high_u8(diff)); + 
sse_u32[1] = vpadalq_u16(sse_u32[1], sse1); + } while (--i != 0); - *sse = horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0])); - return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0])); + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride) { - uint8x8_t a[4], b[4]; - int16x4_t diff_lo[4]; - uint16x8_t diff[4]; - int32x4_t sse; - - a[0] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[0] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[1] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[1] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[2] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[2] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[3] = vld1_u8(src_ptr); - b[3] = vld1_u8(ref_ptr); - - diff[0] = vsubl_u8(a[0], b[0]); - diff[1] = vsubl_u8(a[1], b[1]); - diff[2] = vsubl_u8(a[2], b[2]); - diff[3] = vsubl_u8(a[3], b[3]); - - diff_lo[0] = vget_low_s16(vreinterpretq_s16_u16(diff[0])); - diff_lo[1] = vget_low_s16(vreinterpretq_s16_u16(diff[1])); - diff_lo[2] = vget_low_s16(vreinterpretq_s16_u16(diff[2])); - diff_lo[3] = vget_low_s16(vreinterpretq_s16_u16(diff[3])); - - sse = vmull_s16(diff_lo[0], diff_lo[0]); - sse = vmlal_s16(sse, diff_lo[1], diff_lo[1]); - sse = vmlal_s16(sse, diff_lo[2], diff_lo[2]); - sse = vmlal_s16(sse, diff_lo[3], diff_lo[3]); - - return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse)); + uint8x8_t s[2], r[2]; + uint16x8_t abs_diff[2]; + uint32x4_t sse; + + s[0] = load_u8(src_ptr, src_stride); + r[0] = load_u8(ref_ptr, ref_stride); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + s[1] = load_u8(src_ptr, src_stride); + r[1] = load_u8(ref_ptr, ref_stride); + + abs_diff[0] = vabdl_u8(s[0], r[0]); + abs_diff[1] = vabdl_u8(s[1], r[1]); + + sse = vmull_u16(vget_low_u16(abs_diff[0]), vget_low_u16(abs_diff[0])); + sse = vmlal_u16(sse, vget_high_u16(abs_diff[0]), vget_high_u16(abs_diff[0])); + sse = vmlal_u16(sse, vget_low_u16(abs_diff[1]), vget_low_u16(abs_diff[1])); + sse = vmlal_u16(sse, vget_high_u16(abs_diff[1]), vget_high_u16(abs_diff[1])); + + return horizontal_add_uint32x4(sse); } -#endif // defined(__ARM_FEATURE_DOTPROD) +#define VPX_MSE_WXH_NEON(w, h) \ + unsigned int vpx_mse##w##x##h##_neon( \ + const unsigned char *src_ptr, int src_stride, \ + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \ + *sse = vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + return *sse; \ + } + +VPX_MSE_WXH_NEON(8, 8) +VPX_MSE_WXH_NEON(8, 16) +VPX_MSE_WXH_NEON(16, 8) +VPX_MSE_WXH_NEON(16, 16) + +#undef VPX_MSE_WXH_NEON diff --git a/vpx_dsp/arm/variance_neon_dotprod.c b/vpx_dsp/arm/variance_neon_dotprod.c new file mode 100644 index 000000000..ab843e9fc --- /dev/null +++ b/vpx_dsp/arm/variance_neon_dotprod.c @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
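The VPX_MSE_WXH_NEON wrappers preserve the historical interface: the total is both stored through *sse and returned, and despite the name it is a raw sum of squared errors, not divided by the pixel count. A hypothetical call site (real callers normally reach this through the vpx_dsp_rtcd dispatch rather than the _neon symbol directly):

#include <stdint.h>

extern unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr,
                                      int src_stride,
                                      const unsigned char *ref_ptr,
                                      int ref_stride, unsigned int *sse);

static unsigned int block16_sse(const uint8_t *src, const uint8_t *ref) {
  unsigned int sse;
  // After the call, sse and the return value hold the same 16x16 total.
  return vpx_mse16x16_neon(src, 16, ref, 16, &sse);
}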
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +// Process a block of width 4 four rows at a time. +static INLINE void variance_4xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + i -= 4; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 8 two rows at a time. +static INLINE void variance_8xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = + vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride)); + const uint8x16_t r = + vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride)); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 16 one row at a time. +static INLINE void variance_16xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of any size where the width is divisible by 16. 
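The variance kernels in this new file use the dot-product extension twice per row: UDOT on the absolute difference with itself accumulates squared differences directly (|s - r| . |s - r| == (s - r)^2), and UDOT against an all-ones vector accumulates the plain byte sums, so the block sum falls out as sum(src) - sum(ref) with no widening to 16 bits. The VARIANCE_WXH_NEON_DOTPROD wrappers that follow combine the two as sse - sum^2 / (w * h), with the division done as a shift because w * h is a power of two; the vpx_mse* and vpx_get4x4sse_cs kernels further down reuse only the SSE half. A minimal scalar sketch of the (sse, sum) pair each helper accumulates (hypothetical function name, for illustration):

#include <stdint.h>

/* Scalar reference for the per-block accumulation done by the
 * variance_*_neon_dotprod helpers; illustrative only. */
static void variance_ref_c(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride, int w, int h,
                           uint32_t *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[j] - ref[j];
      *sum += diff;                    /* vdotq_u32(src_sum/ref_sum, x, 1s) */
      *sse += (uint32_t)(diff * diff); /* vdotq_u32(sse_u32, |s-r|, |s-r|) */
    }
    src += src_stride;
    ref += ref_stride;
  }
}

A w-by-h wrapper then returns *sse - (uint32_t)(((int64_t)sum * sum) >> shift) with shift == log2(w * h), i.e. the SSE minus the squared-mean term.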
+static INLINE void variance_large_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + j += 16; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +static INLINE void variance_32xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 32, h, sse, + sum); +} + +static INLINE void variance_64xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 64, h, sse, + sum); +} + +void vpx_get8x8var_neon_dotprod(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_8xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, + sum); +} + +void vpx_get16x16var_neon_dotprod(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_16xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, + sum); +} + +#define VARIANCE_WXH_NEON_DOTPROD(w, h, shift) \ + unsigned int vpx_variance##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + variance_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } + +VARIANCE_WXH_NEON_DOTPROD(4, 4, 4) +VARIANCE_WXH_NEON_DOTPROD(4, 8, 5) + +VARIANCE_WXH_NEON_DOTPROD(8, 4, 5) +VARIANCE_WXH_NEON_DOTPROD(8, 8, 6) +VARIANCE_WXH_NEON_DOTPROD(8, 16, 7) + +VARIANCE_WXH_NEON_DOTPROD(16, 8, 7) +VARIANCE_WXH_NEON_DOTPROD(16, 16, 8) +VARIANCE_WXH_NEON_DOTPROD(16, 32, 9) + +VARIANCE_WXH_NEON_DOTPROD(32, 16, 9) +VARIANCE_WXH_NEON_DOTPROD(32, 32, 10) +VARIANCE_WXH_NEON_DOTPROD(32, 64, 11) + +VARIANCE_WXH_NEON_DOTPROD(64, 32, 11) +VARIANCE_WXH_NEON_DOTPROD(64, 64, 12) + +#undef VARIANCE_WXH_NEON_DOTPROD + +static INLINE unsigned int vpx_mse8xh_neon_dotprod(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + int i = h / 2; + do { + uint8x8_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1_u8(src_ptr); + src_ptr += src_stride; + r0 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabd_u8(s0, r0); + diff1 = vabd_u8(s1, r1); + + sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); + + return horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1])); +} + +static INLINE unsigned int 
vpx_mse16xh_neon_dotprod( + const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u8(src_ptr); + src_ptr += src_stride; + r0 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabdq_u8(s0, r0); + diff1 = vabdq_u8(s1, r1); + + sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); +} + +unsigned int vpx_get4x4sse_cs_neon_dotprod(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride) { + uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff); + + return horizontal_add_uint32x4(sse); +} + +#define VPX_MSE_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_mse##w##x##h##_neon_dotprod( \ + const unsigned char *src_ptr, int src_stride, \ + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \ + *sse = vpx_mse##w##xh_neon_dotprod(src_ptr, src_stride, ref_ptr, \ + ref_stride, h); \ + return *sse; \ + } + +VPX_MSE_WXH_NEON_DOTPROD(8, 8) +VPX_MSE_WXH_NEON_DOTPROD(8, 16) +VPX_MSE_WXH_NEON_DOTPROD(16, 8) +VPX_MSE_WXH_NEON_DOTPROD(16, 16) + +#undef VPX_MSE_WXH_NEON_DOTPROD diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index b4cdd58c7..8b89862ba 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -17,6 +17,7 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" // Note: @@ -31,1240 +32,6 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. -#if defined(__aarch64__) && \ - (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { - 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, - 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, - 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 -}; - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ - 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ - 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. 
*/ - 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 -}; - -#if defined(__ARM_FEATURE_MATMUL_INT8) - -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - uint8x16_t s0, s1, s2, s3; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; - uint8x8_t d01, d23; - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = convolve8_4_usdot(s0, filters, permute_tbl); - t1 = convolve8_4_usdot(s1, filters, permute_tbl); - t2 = convolve8_4_usdot(s2, filters, permute_tbl); - t3 = convolve8_4_usdot(s3, filters, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_usdot(s0, filters, permute_tbl); - d1 = convolve8_8_usdot(s1, filters, permute_tbl); - d2 = convolve8_8_usdot(s2, filters, permute_tbl); - d3 = convolve8_8_usdot(s3, filters, permute_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } -} - -void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - uint8x16_t s0, s1, s2, s3; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; - uint8x8_t d01, d23, dd01, dd23; - dd01 = vdup_n_u8(0); - dd23 = vdup_n_u8(0); - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = convolve8_4_usdot(s0, filters, permute_tbl); - t1 = convolve8_4_usdot(s1, filters, permute_tbl); - t2 = convolve8_4_usdot(s2, filters, permute_tbl); - t3 = convolve8_4_usdot(s3, filters, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x3_t 
permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_usdot(s0, filters, permute_tbl); - d1 = convolve8_8_usdot(s1, filters, permute_tbl); - d2 = convolve8_8_usdot(s2, filters, permute_tbl); - d3 = convolve8_8_usdot(s3, filters, permute_tbl); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } -} - -static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, - uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b = vqtbl2q_u8(samples, permute_tbl); -} - -static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, - uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b0, uint8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. 
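Every convolve8_4_*/convolve8_8_* helper used in this file, whichever instruction-set variant is compiled, evaluates the same 8-tap, 7-bit fixed-point filter; vqrshrun_n_s16(..., 7) then rounds, narrows and saturates the result (the literal 7 is the FILTER_BITS constant that the cleanup further down substitutes). Per output pixel the arithmetic is, in scalar form (a sketch with a hypothetical name):

#include <stdint.h>

/* Scalar sketch of one 8-tap output pixel; 'taps' holds eight consecutive
 * source samples along the filter axis. Illustrative only. */
static uint8_t convolve8_pixel_ref_c(const uint8_t *taps,
                                     const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += taps[k] * filter[k];
  sum = (sum + 64) >> 7; /* vqrshrun_n_s16(..., 7): round to nearest... */
  if (sum < 0) sum = 0;  /* ...then saturate to [0, 255] */
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}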
- */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); -} - -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x16x2_t samples_LUT; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - src += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_usdot_partial(s0123, s4567, filters); - d1 = convolve8_4_usdot_partial(s1234, s5678, filters); - d2 = convolve8_4_usdot_partial(s2345, s6789, filters); - d3 = convolve8_4_usdot_partial(s3456, s78910, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
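The vertical dot-product kernels avoid reloading seven of the eleven input rows on each trip around the loop: a seven-row transposed history is kept in registers, four new rows are merged in with vqtbl2q_u8 through merge_block_tbl, four output rows are filtered, and the whole window then slides down four rows (the "shuffle everything up four rows" assignments). The same schedule in scalar form, for a 4-wide strip (a sketch with a hypothetical name; src is assumed to point three rows above the first output row, as after src -= 3 * src_stride above, and h to be a multiple of 4):

#include <stdint.h>
#include <string.h>

/* Scalar model of the vertical kernels' row reuse: an 11-row window, four new
 * rows loaded and four output rows produced per iteration. Illustrative only. */
static void convolve8_vert_4w_ref_c(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride,
                                    const int16_t *filter, int h) {
  uint8_t win[11][4];
  int y, x, k, t;
  for (k = 0; k < 7; ++k) memcpy(win[k], src + k * src_stride, 4);
  src += 7 * src_stride;
  for (y = 0; y < h; y += 4) {
    for (k = 0; k < 4; ++k) memcpy(win[7 + k], src + k * src_stride, 4);
    src += 4 * src_stride;
    for (k = 0; k < 4; ++k) { /* four output rows from the 11-row window */
      for (x = 0; x < 4; ++x) {
        int sum = 0;
        for (t = 0; t < 8; ++t) sum += win[k + t][x] * filter[t];
        sum = (sum + 64) >> 7;
        dst[(y + k) * dst_stride + x] =
            (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
      }
    }
    memmove(win, win[4], 7 * sizeof(win[0])); /* slide down four rows */
  }
}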
*/ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - s += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filters); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filters); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filters); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filters); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
*/ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } -} - -void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x16x2_t samples_LUT; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; - uint8x8_t d01, d23, dd01, dd23; - - load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - src += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_usdot_partial(s0123, s4567, filters); - d1 = convolve8_4_usdot_partial(s1234, s5678, filters); - d2 = convolve8_4_usdot_partial(s2345, s6789, filters); - d3 = convolve8_4_usdot_partial(s3456, s78910, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
*/ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - s += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filters); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filters); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filters); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filters); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
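All of the _avg_ variants blend the filtered result into the existing destination with vrhadd_u8, the rounding halving add; per byte that is (sketch):

/* What vrhadd_u8 computes per byte in the avg kernels: */
static unsigned char avg_round_ref_c(unsigned char filtered,
                                     unsigned char dst_byte) {
  return (unsigned char)((filtered + dst_byte + 1) >> 1);
}

The plain NEON cleanup later in this hunk applies the same d = vrhadd_u8(d, dd) pattern while replacing the lane-wise u32 load/store sequences with the load_u8/store_u8 helpers.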
*/ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } -} - -#else // !defined(__ARM_FEATURE_MATMUL_INT8) - -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; - uint8x8_t d01, d23; - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = - convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); - d1 = - convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); - d2 = - convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); - d3 = - convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } -} - -void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; - uint8x8_t d01, d23, dd01, 
dd23; - dd01 = vdup_n_u8(0); - dd23 = vdup_n_u8(0); - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = - convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); - d1 = - convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); - d2 = - convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); - d3 = - convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } -} - -static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b = vqtbl2q_s8(samples, permute_tbl); -} - -static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b0, - int8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. 
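These fallback kernels have only the signed SDOT instruction available, so each sample is biased from [0, 255] down to [-128, 127] (the range_limit subtraction), and the constant this adds to every dot product, 128 * sum(filter[k]), is precomputed once as 'correction' and folded back in by the convolve8_*_sdot helpers. Scalar model of one tap group (a sketch with a hypothetical name):

#include <stdint.h>

/* Bias u8 samples to s8 for the signed dot product, then cancel the bias with
 * a precomputed correction term. Illustrative only. */
static int32_t filter_taps_sdot_ref_c(const uint8_t *taps,
                                      const int8_t *filter) {
  int32_t sum = 0, correction = 0;
  int k;
  for (k = 0; k < 8; ++k) correction += 128 * filter[k];
  for (k = 0; k < 8; ++k) sum += (taps[k] - 128) * filter[k];
  return sum + correction; /* == sum of taps[k] * filter[k], unbiased */
}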
- */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); -} - -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - src += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. 
*/ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. 
*/ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } -} - -void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; - uint8x8_t d01, d23, dd01, dd23; - - load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - src += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. 
- */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. 
- */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
*/ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } -} - -#endif // defined(__ARM_FEATURE_MATMUL_INT8) - -#else // !(defined(__aarch64__) && - // (defined(__ARM_FEATURE_DOTPROD) || - // defined(__ARM_FEATURE_MATMUL_INT8))) - void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -1273,8 +40,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const int16x8_t filters = vld1q_s16(filter[x0_q4]); uint8x8_t t0, t1, t2, t3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -1286,25 +53,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, if (h == 4) { uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - int16x8_t tt0, tt1, tt2, tt3; __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s0 = vget_low_s16(tt0); - s1 = vget_low_s16(tt1); - s2 = vget_low_s16(tt2); - s3 = vget_low_s16(tt3); - s4 = vget_high_s16(tt0); - s5 = vget_high_s16(tt1); - s6 = vget_high_s16(tt2); + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); __builtin_prefetch(dst + 2 * dst_stride); @@ -1314,32 +78,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s7 = vget_low_s16(tt0); - s8 = vget_low_s16(tt1); - s9 = vget_low_s16(tt2); - s10 = vget_low_s16(tt3); + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); 
7);
transpose_u8_4x4(&d01, &d23); - vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), - vreinterpret_u32_u8(d01), 0); - vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), - vreinterpret_u32_u8(d23), 0); - vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), - vreinterpret_u32_u8(d01), 1); - vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), - vreinterpret_u32_u8(d23), 1); + store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23); s0 = s4; s1 = s5; @@ -1355,7 +109,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } else { int width; const uint8_t *s; - uint8x8_t t4, t5, t6, t7; + uint8x8_t t4, t5, t6, t7, d04, d15, d26, d37; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; if (w == 4) { @@ -1395,32 +149,24 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - transpose_u8_8x4(&t0, &t1, &t2, &t3); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1); - dst += dst_stride; + d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + transpose_u8_8x4(&d04, &d15, &d26, &d37); + + store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04); + store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15); + store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); + store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); + + dst += 8 * dst_stride; h -= 8; } while (h > 0); } else { uint8_t *d; + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; int16x8_t s11, s12, s13, s14; do { @@ -1466,17 +212,18 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d4 = 
convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; @@ -1505,8 +252,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const int16x8_t filters = vld1q_s16(filter[x0_q4]); uint8x8_t t0, t1, t2, t3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -1516,10 +263,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (h == 4) { - uint8x8_t d01, d23; + uint8x8_t d01, d23, dd01, dd23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - int16x8_t tt0, tt1, tt2, tt3; - uint32x4_t d0123 = vdupq_n_u32(0); __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); @@ -1527,17 +272,14 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 3 * src_stride); load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s0 = vget_low_s16(tt0); - s1 = vget_low_s16(tt1); - s2 = vget_low_s16(tt2); - s3 = vget_low_s16(tt3); - s4 = vget_high_s16(tt0); - s5 = vget_high_s16(tt1); - s6 = vget_high_s16(tt2); + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); __builtin_prefetch(dst + 2 * dst_stride); @@ -1547,35 +289,28 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s7 = vget_low_s16(tt0); - s8 = vget_low_s16(tt1); - s9 = vget_low_s16(tt2); - s10 = vget_low_s16(tt3); + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 
7); transpose_u8_4x4(&d01, &d23); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); - d0123 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23))); + dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride); + dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); - vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); - vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2); - vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1); - vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); + store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23); s0 = s4; s1 = s5; @@ -1595,8 +330,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; if (w == 4) { - uint32x4_t d0415 = vdupq_n_u32(0); - uint32x4_t d2637 = vdupq_n_u32(0); + uint8x8_t d04, d15, d26, d37, dd04, dd15, dd26, dd37; + do { load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); @@ -1633,48 +368,35 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - transpose_u8_8x4(&t0, &t1, &t2, &t3); - - d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0); - d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2); - d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1); - d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3); - d0415 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1))); - d2637 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3))); - - vst1q_lane_u32((uint32_t *)dst, d0415, 0); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0415, 2); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 0); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 2); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0415, 1); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0415, 3); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 1); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 3); - dst += dst_stride; + d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + transpose_u8_8x4(&d04, &d15, &d26, &d37); + + dd04 = load_u8(dst + 0 * 
dst_stride, 4 * dst_stride); + dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride); + dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride); + dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride); + + d04 = vrhadd_u8(d04, dd04); + d15 = vrhadd_u8(d15, dd15); + d26 = vrhadd_u8(d26, dd26); + d37 = vrhadd_u8(d37, dd37); + + store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04); + store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15); + store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); + store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); + + dst += 8 * dst_stride; h -= 8; - } while (h > 0); + } while (h != 0); } else { uint8_t *d; + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; int16x8_t s11, s12, s13, s14; - uint8x16_t d01, d23, d45, d67; do { __builtin_prefetch(src + 0 * src_stride); @@ -1719,33 +441,27 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride)); + d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride)); + d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride)); + d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride)); + d4 = vrhadd_u8(d4, vld1_u8(d + 4 * dst_stride)); + d5 = vrhadd_u8(d5, vld1_u8(d + 5 * dst_stride)); + d6 = vrhadd_u8(d6, vld1_u8(d + 6 * dst_stride)); + d7 = vrhadd_u8(d7, vld1_u8(d + 7 * dst_stride)); - d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), - vld1_u8(d + 1 * dst_stride)); - d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), - vld1_u8(d + 3 * dst_stride)); - d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride), - vld1_u8(d + 5 * dst_stride)); - d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride), - vld1_u8(d + 7 * dst_stride)); - d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1)); - d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3)); - d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5)); - d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7)); - - store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01), - vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45), - vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67)); + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; @@ -1761,7 +477,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; - } while (h > 0); + } while (h != 0); } } } @@ -1773,8 +489,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, 
ptrdiff_t src_stride, int h) { const int16x8_t filters = vld1q_s16(filter[y0_q4]); - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -1784,33 +500,26 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - uint8x8_t d01, d23; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + src += 7 * src_stride; do { - s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); @@ -1820,21 +529,16 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -1843,13 +547,15 @@ void vpx_convolve8_vert_neon(const uint8_t 
*src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { int height; const uint8_t *s; uint8_t *d; - uint8x8_t t0, t1, t2, t3; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; do { @@ -1860,33 +566,26 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 4 * src_stride); __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s = src + 7 * src_stride; d = dst; height = h; do { - s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); @@ -1896,19 +595,13 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - vst1_u8(d, t0); - d += dst_stride; - vst1_u8(d, t1); - d += dst_stride; - vst1_u8(d, t2); - d += dst_stride; - vst1_u8(d, t3); - d += dst_stride; + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -1917,6 +610,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; height -= 4; } while (height != 0); src += 8; @@ -1933,8 +628,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int h) { const int16x8_t filters = vld1q_s16(filter[y0_q4]); - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -1944,34 
+639,26 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - uint8x8_t d01, d23; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23, dd01, dd23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - uint32x4_t d0123 = vdupq_n_u32(0); - - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + src += 7 * src_stride; do { - s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); @@ -1981,29 +668,22 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); - - d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); - d0123 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23))); - - vst1q_lane_u32((uint32_t *)dst, d0123, 0); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0123, 1); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0123, 2); - dst += dst_stride; - vst1q_lane_u32((uint32_t 
*)dst, d0123, 3); - dst += dst_stride; + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -2012,14 +692,15 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { int height; const uint8_t *s; uint8_t *d; - uint8x8_t t0, t1, t2, t3; - uint8x16_t d01, d23, dd01, dd23; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; do { @@ -2030,33 +711,26 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 4 * src_stride); __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s = src + 7 * src_stride; d = dst; height = h; do { - s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); @@ -2066,28 +740,18 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - d01 = vcombine_u8(t0, t1); - d23 = vcombine_u8(t2, t3); - dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), - vld1_u8(d + 1 * dst_stride)); - dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), - vld1_u8(d + 3 * dst_stride)); - dd01 = vrhaddq_u8(dd01, d01); - dd23 = vrhaddq_u8(dd23, d23); - - vst1_u8(d, vget_low_u8(dd01)); - d += dst_stride; - vst1_u8(d, vget_high_u8(dd01)); - d += dst_stride; - vst1_u8(d, vget_low_u8(dd23)); - d += dst_stride; - vst1_u8(d, vget_high_u8(dd23)); - d += dst_stride; + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, 
filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride)); + d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride)); + d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride)); + d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride)); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -2097,6 +761,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = s9; s6 = s10; height -= 4; + s += 4 * src_stride; + d += 4 * dst_stride; } while (height != 0); src += 8; dst += 8; @@ -2104,7 +770,3 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } while (w != 0); } } - -#endif // #if defined(__aarch64__) && - // (defined(__ARM_FEATURE_DOTPROD) || - // defined(__ARM_FEATURE_MATMUL_INT8)) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index ed7f18053..025e943cc 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -15,10 +15,18 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_filter.h" -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) -static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, +void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h); + +static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, const int32x4_t correction, const int8x8_t filters) { @@ -29,11 +37,11 @@ static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, sum = vdotq_lane_s32(correction, samples_lo, filters, 0); sum = vdotq_lane_s32(sum, samples_hi, filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); } -static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples, +static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples, const int8x8_t filters, const int32x4_t correction, const uint8x16_t range_limit, @@ -54,8 +62,8 @@ static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples, sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); } static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, @@ -78,7 +86,7 @@ static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, /* Narrow and re-pack. */ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, @@ -111,14 +119,20 @@ static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, /* Narrow and re-pack. 
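Each pair of 32-bit dot-product accumulators is saturating-narrowed to 16 bits here, then vqrshrun_n_s16 adds the rounding constant 1 << (FILTER_BITS - 1) = 64, shifts right by FILTER_BITS (7, the fixed-point precision of the 8-tap filters) and saturates to 8 bits; in effect, out = clamp((s0 * f0 + ... + s7 * f7 + 64) >> 7, 0, 255) per pixel.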
*/ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) -#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) +#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) -static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, +void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h); + +static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, const uint8x16_t samples_hi, const int8x8_t filters) { /* Sample permutation is performed by the caller. */ @@ -127,11 +141,11 @@ static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); } -static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, +static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { uint8x16_t permuted_samples[2]; @@ -147,8 +161,8 @@ static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); } static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, @@ -169,7 +183,7 @@ static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, /* Narrow and re-pack. */ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, @@ -196,10 +210,10 @@ static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, /* Narrow and re-pack. */ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) +#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, @@ -238,7 +252,7 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3)); sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, diff --git a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c new file mode 100644 index 000000000..bf01364cf --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -0,0 +1,777 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { + /* Shift left and insert new last column in transposed 4x4 block. */ + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + /* Shift left and insert two new columns in transposed 4x4 block. */ + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + /* Shift left and insert three new columns in transposed 4x4 block. */ + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + assert(h % 4 == 3); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. 
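+ * In short: the 2D wrapper runs this horizontal pass over h + 7 rows (three above and four below the block) to feed the subsequent 8-tap vertical pass, and since the final output height is a multiple of 4, the intermediate height seen here is always congruent to 3 modulo 4.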
*/ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + } +} + +void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, 
filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 
'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b = vqtbl2q_s8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b0, + int8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); +} + +void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. 
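+ * After this step each 32-bit lane of s0123 holds four vertically adjacent samples of one output column (rows 0-3 of columns 0-3), so a single vdotq_lane_s32 can multiply them by four consecutive filter taps and accumulate per column.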
+ */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. 
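+ * In the 8-wide case the transposed samples are split into lo/hi halves, lo covering columns 0-3 and hi columns 4-7, and the same per-column dot product is applied to each half.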
+ */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
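+ * Each iteration consumes source rows n to n+10 to produce four output rows, so seven rows carry over: only the four new rows are loaded and transposed, and the three overlapping blocks in between are rebuilt by the merge_block_tbl lookups above.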
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. 
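+ * vqtbl2q_s8 treats { s3456, s78910 } as a single 32-byte table, with indices 0-15 addressing the old block and 16-31 the new one, so the three merge_block_tbl permutes shift the transposed 4x4 block left by one, two and three columns, pulling the replacement columns from s78910 to form s4567, s5678 and s6789 without reloading any samples.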
*/ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. 
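+ * The lo and hi column halves are merged independently using the same three permute vectors.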
*/ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c new file mode 100644 index 000000000..e0e482e3f --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -0,0 +1,698 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { + /* Shift left and insert new last column in transposed 4x4 block. */ + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + /* Shift left and insert two new columns in transposed 4x4 block. */ + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + /* Shift left and insert three new columns in transposed 4x4 block. 
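+ * Entries are vqtbl2 indices: values 0-15 select bytes from the first vector of the table pair (the current block) and values 16-31 from the second (the incoming block).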
*/ + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + assert(h % 4 == 3); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_4_usdot(s0, filters, perm_tbl); + d1 = convolve8_4_usdot(s1, filters, perm_tbl); + d2 = convolve8_4_usdot(s2, filters, perm_tbl); + d3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve8_4_usdot(s0, filters, perm_tbl); + d1 = convolve8_4_usdot(s1, filters, perm_tbl); + d2 = convolve8_4_usdot(s2, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. 
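+ * As in the dotprod kernel, the intermediate block for the vertical pass is h + 7 rows tall, which leaves this three-row tail after the main 4-row loop.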
*/ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + +void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, perm_tbl); + t1 = convolve8_4_usdot(s1, filters, perm_tbl); + t2 = convolve8_4_usdot(s2, filters, perm_tbl); + t3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, perm_tbl); + t1 = convolve8_4_usdot(s1, filters, perm_tbl); + t2 = convolve8_4_usdot(s2, filters, perm_tbl); + t3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const 
uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b = vqtbl2q_u8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b0, uint8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. 
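
In scalar terms, these transpose/concat helpers put element j of input row i at byte i + 4 * j of the output block. A buffer-based C sketch of transpose_concat_4x4 (a model of the layout, not the NEON implementation; the name is illustrative):

#include <stdint.h>

/* Four 4-byte rows in, one 16-byte column-major block out. */
static void transpose_concat_4x4_model(const uint8_t a[4][4], uint8_t b[16]) {
  int i, j;
  for (i = 0; i < 4; ++i)
    for (j = 0; j < 4; ++j)
      b[4 * j + i] = a[i][j]; /* b: 00,10,20,30, 01,11,21,31, ... */
}
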
+ */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); +} + +void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
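
A note on the merge step above: each sNNNN vector holds a 4-row window in the transposed layout, so advancing the window by one row is a pure byte permutation over the pair {oldest block, newest block} - which is all dot_prod_merge_block_tbl encodes. A scalar model of the first table (shift by one row); the function name is illustrative:

#include <stdint.h>

/* Model of vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]): per 4-byte
 * group, keep the last 3 bytes of the old block and take 1 byte from
 * the new block (table indices 16..31 address the second input). */
static void merge_shift1_model(const uint8_t s3456[16],
                               const uint8_t s78910[16], uint8_t s4567[16]) {
  int g;
  for (g = 0; g < 4; ++g) {
    s4567[4 * g + 0] = s3456[4 * g + 1];
    s4567[4 * g + 1] = s3456[4 * g + 2];
    s4567[4 * g + 2] = s3456[4 * g + 3];
    s4567[4 * g + 3] = s78910[4 * g + 0];
  }
}
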
*/ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/vpx_dsp/arm/vpx_convolve_copy_neon.c b/vpx_dsp/arm/vpx_convolve_copy_neon.c index 361ec8a80..bea7c9843 100644 --- a/vpx_dsp/arm/vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -9,6 +9,7 @@ */ #include <arm_neon.h> +#include <string.h> #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" @@ -26,10 +27,10 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, if (w < 8) { // copy4 do { - *(uint32_t *)dst = *(const uint32_t *)src; + memcpy(dst, src, 4); src += src_stride; dst += dst_stride; - *(uint32_t *)dst = *(const uint32_t *)src; + memcpy(dst, src, 4); src += src_stride; dst += dst_stride; h -= 2; diff --git a/vpx_dsp/arm/vpx_convolve_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve_neon_dotprod.c new file mode 100644 index 000000000..400e26b30 --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve_neon_dotprod.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" + +void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the + * maximum buffer size to 64 * (64 + 7). */ + uint8_t temp[64 * 71]; + + /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + /* Filter starting 3 lines back. */ + vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, intermediate_height); + + /* Step into the temp buffer 3 lines to get the actual frame data. 
*/ + vpx_convolve8_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + uint8_t temp[64 * 71]; + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, intermediate_height); + + vpx_convolve8_avg_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); +} diff --git a/vpx_dsp/arm/vpx_convolve_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve_neon_i8mm.c new file mode 100644 index 000000000..4d94bb79b --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve_neon_i8mm.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" + +void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the + * maximum buffer size to 64 * (64 + 7). */ + uint8_t temp[64 * 71]; + + /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + /* Filter starting 3 lines back. */ + vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, + w, intermediate_height); + + /* Step into the temp buffer 3 lines to get the actual frame data. 
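
The buffer sizing in these two-pass wrappers follows directly from the stated constraints: with an 8-tap filter the horizontal pass must produce taps - 1 = 7 extra rows (3 above, 4 below), so the w-stride scratch buffer needs at most 64 * (64 + 7) = 4544 bytes, and the vertical pass starts 3 rows in at temp + w * 3. A sketch of the sizing rule under those assumptions (the helper name is illustrative):

#include <stddef.h>

/* Scratch size for the two-pass convolution, assuming w <= 64,
 * h <= 64 and an 8-tap kernel as asserted by the callers. */
static size_t convolve_temp_size(int w, int h, int taps) {
  return (size_t)w * (size_t)(h + taps - 1); /* 64 * 71 == 4544 max */
}
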
*/ + vpx_convolve8_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + uint8_t temp[64 * 71]; + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, + w, intermediate_height); + + vpx_convolve8_avg_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); +} diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c index 954015407..a8dcab7da 100644 --- a/vpx_dsp/avg.c +++ b/vpx_dsp/avg.c @@ -295,19 +295,19 @@ void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, vpx_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); } - // coeff: 15 bit, dynamic range [-16320, 16320] + // coeff: 16 bit, dynamic range [-32768, 32767] for (idx = 0; idx < 256; ++idx) { tran_low_t a0 = coeff[0]; tran_low_t a1 = coeff[256]; tran_low_t a2 = coeff[512]; tran_low_t a3 = coeff[768]; - tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 17 bit, [-65536, 65535] tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range - tran_low_t b2 = (a2 + a3) >> 2; // [-16320, 16320] + tran_low_t b2 = (a2 + a3) >> 2; // [-16384, 16383] tran_low_t b3 = (a2 - a3) >> 2; - coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[0] = b0 + b2; // 16 bit, [-32768, 32767] coeff[256] = b1 + b3; coeff[512] = b0 - b2; coeff[768] = b1 - b3; @@ -428,7 +428,7 @@ void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int i, j; const uint16_t *s = CONVERT_TO_SHORTPTR(s8); const uint16_t *d = CONVERT_TO_SHORTPTR(d8); - *min = 255; + *min = 65535; *max = 0; for (i = 0; i < 8; ++i, s += p, d += dp) { for (j = 0; j < 8; ++j) { diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c index 77be0bb4f..9bb1691e2 100644 --- a/vpx_dsp/loongarch/quantize_lsx.c +++ b/vpx_dsp/loongarch/quantize_lsx.c @@ -11,6 +11,8 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_util/loongson_intrinsics.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs, __m128i round, __m128i quant, @@ -88,15 +90,15 @@ static INLINE int16_t accumulate_eob(__m128i eob) { } #if !CONFIG_VP9_HIGHBITDEPTH + void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { __m128i zero = __lsx_vldi(0); int index = 16; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, quant_shift; __m128i coeff0, coeff1; @@ -104,13 +106,11 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan; - - zbin = __lsx_vld(zbin_ptr, 0); - round = 
__lsx_vld(round_ptr, 0); - quant = __lsx_vld(quant_ptr, 0); + zbin = __lsx_vld(mb_plane->zbin, 0); + round = __lsx_vld(mb_plane->round, 0); + quant = __lsx_vld(mb_plane->quant, 0); dequant = __lsx_vld(dequant_ptr, 0); - quant_shift = __lsx_vld(quant_shift_ptr, 0); + quant_shift = __lsx_vld(mb_plane->quant_shift, 0); // Handle one DC and first 15 AC. DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); qcoeff0 = __lsx_vabsd_h(coeff0, zero); @@ -167,31 +167,27 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, +void vpx_quantize_b_32x32_lsx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { __m128i zero = __lsx_vldi(0); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, quant_shift; __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1; __m128i eob = zero, eob0; - (void)scan; - (void)n_coeffs; - - zbin = __lsx_vld(zbin_ptr, 0); + zbin = __lsx_vld(mb_plane->zbin, 0); zbin = __lsx_vsrari_h(zbin, 1); - round = __lsx_vld(round_ptr, 0); + round = __lsx_vld(mb_plane->round, 0); round = __lsx_vsrari_h(round, 1); - quant = __lsx_vld(quant_ptr, 0); + quant = __lsx_vld(mb_plane->quant, 0); dequant = __lsx_vld(dequant_ptr, 0); - quant_shift = __lsx_vld(quant_shift_ptr, 0); + quant_shift = __lsx_vld(mb_plane->quant_shift, 0); quant_shift = __lsx_vslli_h(quant_shift, 1); // Handle one DC and first 15 AC. 
DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h index d54ce5368..53462b59f 100644 --- a/vpx_dsp/mips/macros_msa.h +++ b/vpx_dsp/mips/macros_msa.h @@ -774,16 +774,16 @@ Details : 4 signed word elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_SW_S32(in) \ - ({ \ - v2i64 res0_m, res1_m; \ - int32_t sum_m; \ - \ - res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ - res1_m = __msa_splati_d(res0_m, 1); \ - res0_m = res0_m + res1_m; \ - sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ - sum_m; \ +#define HADD_SW_S32(in) \ + ({ \ + v2i64 hadd_sw_s32_res0_m, hadd_sw_s32_res1_m; \ + int32_t hadd_sw_s32_sum_m; \ + \ + hadd_sw_s32_res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + hadd_sw_s32_res1_m = __msa_splati_d(hadd_sw_s32_res0_m, 1); \ + hadd_sw_s32_res0_m = hadd_sw_s32_res0_m + hadd_sw_s32_res1_m; \ + hadd_sw_s32_sum_m = __msa_copy_s_w((v4i32)hadd_sw_s32_res0_m, 0); \ + hadd_sw_s32_sum_m; \ }) /* Description : Horizontal addition of 4 unsigned word elements @@ -793,16 +793,16 @@ Details : 4 unsigned word elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_UW_U32(in) \ - ({ \ - v2u64 res0_m, res1_m; \ - uint32_t sum_m; \ - \ - res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ - res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ - res0_m += res1_m; \ - sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ - sum_m; \ +#define HADD_UW_U32(in) \ + ({ \ + v2u64 hadd_uw_u32_res0_m, hadd_uw_u32_res1_m; \ + uint32_t hadd_uw_u32_sum_m; \ + \ + hadd_uw_u32_res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ + hadd_uw_u32_res1_m = (v2u64)__msa_splati_d((v2i64)hadd_uw_u32_res0_m, 1); \ + hadd_uw_u32_res0_m += hadd_uw_u32_res1_m; \ + hadd_uw_u32_sum_m = __msa_copy_u_w((v4i32)hadd_uw_u32_res0_m, 0); \ + hadd_uw_u32_sum_m; \ }) /* Description : Horizontal addition of 8 unsigned halfword elements @@ -812,14 +812,14 @@ Details : 8 unsigned halfword elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_UH_U32(in) \ - ({ \ - v4u32 res_m; \ - uint32_t sum_m; \ - \ - res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ - sum_m = HADD_UW_U32(res_m); \ - sum_m; \ +#define HADD_UH_U32(in) \ + ({ \ + v4u32 hadd_uh_u32_res_m; \ + uint32_t hadd_uh_u32_sum_m; \ + \ + hadd_uh_u32_res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + hadd_uh_u32_sum_m = HADD_UW_U32(hadd_uh_u32_res_m); \ + hadd_uh_u32_sum_m; \ }) /* Description : Horizontal addition of unsigned byte vector elements diff --git a/vpx_dsp/ppc/variance_vsx.c b/vpx_dsp/ppc/variance_vsx.c index be9614a35..6c6bc9a30 100644 --- a/vpx_dsp/ppc/variance_vsx.c +++ b/vpx_dsp/ppc/variance_vsx.c @@ -225,7 +225,7 @@ static INLINE void variance(const uint8_t *src_ptr, int src_stride, } /* Identical to the variance call except it does not calculate the - * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * sse - sum^2 / w*h and returns sse in addition to modifying the passed in * variable. 
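
The relationship this comment refers to, in scalar form: the variance kernels accumulate both the sum and the sum of squares of the pixel differences and return sse - sum^2 / (w * h), while the MSE variants simply skip the correction term. A model of the final step (illustration only; the function name is not from the source):

#include <stdint.h>

/* variance = sse minus the mean-correction term; MSE returns sse as-is. */
static uint32_t variance_from_sums(int64_t sse, int64_t sum, int w, int h) {
  return (uint32_t)(sse - (sum * sum) / (w * h));
}
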
*/ #define MSE(W, H) \ diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c index f0d4e927a..4ee4130a2 100644 --- a/vpx_dsp/psnr.c +++ b/vpx_dsp/psnr.c @@ -45,14 +45,14 @@ static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, } #if CONFIG_VP9_HIGHBITDEPTH -static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h) { +static int64_t encoder_highbd_sse(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h) { int i, j; int64_t sse = 0; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { @@ -88,10 +88,8 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; - unsigned int sse; for (x = 0; x < width / 16; ++x) { - vpx_mse16x16(pa, a_stride, pb, b_stride, &sse); - total_sse += sse; + total_sse += vpx_sse(pa, a_stride, pb, b_stride, 16, 16); pa += 16; pb += 16; @@ -131,21 +129,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, const int dw = width % 16; const int dh = height % 16; if (dw > 0) { - total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw], - b_stride, dw, height); + total_sse += encoder_highbd_sse(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height); } if (dh > 0) { - total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh); + total_sse += encoder_highbd_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; - unsigned int sse; for (x = 0; x < width / 16; ++x) { - vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); - total_sse += sse; + total_sse += vpx_highbd_sse(pa, a_stride, pb, b_stride, 16, 16); pa += 16; pb += 16; } diff --git a/vpx_dsp/psnr.h b/vpx_dsp/psnr.h index 9ebb64dd5..7c57aa429 100644 --- a/vpx_dsp/psnr.h +++ b/vpx_dsp/psnr.h @@ -26,7 +26,7 @@ typedef struct vpx_psnr_pkt PSNR_STATS; /*!\brief Converts SSE to PSNR * - * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR). + * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
* * \param[in] samples Number of samples * \param[in] peak Max sample value diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 5d6ba64a8..fac9136f8 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,6 +14,8 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int16_t *round_ptr, const int16_t quant, @@ -114,15 +116,17 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, #endif void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - (void)iscan; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -162,16 +166,17 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - (void)iscan; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -208,21 +213,23 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } #endif -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const struct ScanOrder *const scan_order) { + const int n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + 
const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; int idx = 0; - int idx_arr[1024]; + int idx_arr[32 * 32 /* n_coeffs */]; int i, eob = -1; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -269,19 +276,21 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { + const intptr_t n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; int idx = 0; int idx_arr[1024]; int i, eob = -1; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index b47c43430..2a4c81d58 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -40,9 +40,15 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, unsigned int vpx_sad##m##x##n##_avg_c( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ - DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ + DECLARE_ALIGNED(32, uint8_t, comp_pred[m * n]); \ vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ return sad(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_sad_skip_##m##x##n##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride, (m), \ + (n / 2)); \ } // Compare |src_ptr| to 4 distinct references in |ref_array[4]| @@ -54,6 +60,15 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, for (i = 0; i < 4; ++i) \ sad_array[i] = \ vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \ + } \ + void vpx_sad_skip_##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = 2 * sad(src_ptr, 2 * src_stride, ref_array[i], \ + 2 * ref_stride, (m), (n / 2)); \ + } \ } /* clang-format off */ @@ -156,6 +171,12 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \ n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride); \ return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_highbd_sad_skip_##m##x##n##_c( 
\ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \ } #define highbd_sadMxNx4D(m, n) \ @@ -167,6 +188,15 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ ref_array[i], ref_stride); \ } \ + } \ + void vpx_highbd_sad_skip_##m##x##n##x4d_c( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad_skip_##m##x##n##_c( \ + src, src_stride, ref_array[i], ref_stride); \ + } \ } /* clang-format off */ diff --git a/vpx_dsp/sse.c b/vpx_dsp/sse.c new file mode 100644 index 000000000..6cb4b705f --- /dev/null +++ b/vpx_dsp/sse.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Sum the square of the difference between every corresponding element of the + * buffers. + */ + +#include <stdlib.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" + +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = abs(a[x] - b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} +#endif diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index ce1e8382b..1c476542f 100644 --- a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -156,7 +156,7 @@ static void var_filter_block2d_bil_second_pass( const uint8_t *second_pred) { \ uint16_t fdata3[(H + 1) * W]; \ uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + DECLARE_ALIGNED(32, uint8_t, temp3[H * W]); \ \ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ W, bilinear_filters[x_offset]); \ @@ -180,7 +180,7 @@ static void var_filter_block2d_bil_second_pass( } /* Identical to the variance call except it does not calculate the - * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * sse - sum^2 / w*h and returns sse in addition to modifying the passed in * variable. */ #define MSE(W, H) \ diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h index 755cb907d..ccdb2f90b 100644 --- a/vpx_dsp/variance.h +++ b/vpx_dsp/variance.h @@ -69,11 +69,15 @@ typedef struct variance_vtable { #if CONFIG_VP9 typedef struct vp9_variance_vtable { vpx_sad_fn_t sdf; + // Same as normal sad, but downsample the rows by a factor of 2. 
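
The downsampled SAD behind these function-pointer hooks is the vpx_sad_skip_* family added earlier in this change: both strides are doubled so only every other row is read, and the result is doubled to stay comparable with full-SAD scores. A scalar model (illustration only; the name is not from the source):

#include <stdint.h>
#include <stdlib.h>

/* Model of vpx_sad_skip_WxH_c: h/2 rows sampled at twice the stride,
 * result scaled by 2. */
static unsigned int sad_skip_model(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int w, int h) {
  unsigned int sad = 0;
  int x, y;
  for (y = 0; y < h / 2; ++y) {
    for (x = 0; x < w; ++x) sad += (unsigned int)abs(src[x] - ref[x]);
    src += 2 * src_stride;
    ref += 2 * ref_stride;
  }
  return 2 * sad;
}
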
+ vpx_sad_fn_t sdsf; vpx_sad_avg_fn_t sdaf; vpx_variance_fn_t vf; vpx_subpixvariance_fn_t svf; vpx_subp_avg_variance_fn_t svaf; vpx_sad_multi_d_fn_t sdx4df; + // Same as sadx4, but downsample the rows by a factor of 2. + vpx_sad_multi_d_fn_t sdsx4df; } vp9_variance_fn_ptr_t; #endif // CONFIG_VP9 diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 1fd9495cf..93abf39ff 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -31,10 +31,15 @@ DSP_SRCS-yes += bitwriter_buffer.c DSP_SRCS-yes += bitwriter_buffer.h DSP_SRCS-yes += psnr.c DSP_SRCS-yes += psnr.h +DSP_SRCS-yes += sse.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c +DSP_SRCS-$(HAVE_NEON) += arm/sse_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sse_neon_dotprod.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/sse_sse4.c +DSP_SRCS-$(HAVE_AVX2) += x86/sse_avx2.c endif ifeq ($(CONFIG_DECODERS),yes) @@ -133,6 +138,10 @@ DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c DSP_SRCS-yes += arm/vpx_convolve8_neon.c DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c DSP_SRCS-yes += arm/vpx_convolve_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve8_neon_dotprod.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve_neon_dotprod.c +DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve8_neon_i8mm.c +DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve_neon_i8mm.c endif # HAVE_NEON endif # HAVE_NEON_ASM @@ -252,6 +261,7 @@ DSP_SRCS-yes += inv_txfm.h DSP_SRCS-yes += inv_txfm.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/inv_txfm_avx2.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.h DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c @@ -342,6 +352,10 @@ DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_NEON) += arm/highbd_hadamard_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_neon.c +endif DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/avg_lsx.c ifeq ($(VPX_ARCH_X86_64),yes) @@ -364,7 +378,9 @@ DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad4d_neon_dotprod.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad_neon_dotprod.c DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c @@ -392,6 +408,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c @@ -406,6 +423,7 @@ DSP_SRCS-yes += variance.h DSP_SRCS-$(HAVE_NEON) += arm/avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/variance_neon_dotprod.c DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c @@ -418,6 +436,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/avg_pred_lsx.c DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c 
DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/avg_pred_avx2.c DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c DSP_SRCS-$(HAVE_VSX) += ppc/variance_vsx.c @@ -432,7 +451,10 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sse_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h index 2de449546..4b946d756 100644 --- a/vpx_dsp/vpx_dsp_common.h +++ b/vpx_dsp/vpx_dsp_common.h @@ -45,9 +45,21 @@ typedef int16_t tran_low_t; typedef int16_t tran_coef_t; +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// produces invalid code for clip_pixel() when the return type is uint8_t. +// See: +// https://developercommunity.visualstudio.com/t/Misoptimization-for-ARM64-in-VS-2022-17/10363361 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +static INLINE int clip_pixel(int val) { + return (val > 255) ? 255 : (val < 0) ? 0 : val; +} +#else static INLINE uint8_t clip_pixel(int val) { return (val > 255) ? 255 : (val < 0) ? 0 : val; } +#endif static INLINE int clamp(int value, int low, int high) { return value < low ? low : (value > high ? high : value); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8725821b6..e9d63f6ef 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -17,6 +17,10 @@ print <<EOF #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif EOF } @@ -38,7 +42,7 @@ if ($opts{arch} eq "x86_64") { # add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_4x4 sse2/; +specialize qw/vpx_d207_predictor_4x4 neon sse2/; add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_4x4 neon sse2/; @@ -46,7 +50,7 @@ specialize qw/vpx_d45_predictor_4x4 neon sse2/; add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_4x4 ssse3/; +specialize qw/vpx_d63_predictor_4x4 neon ssse3/; add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; @@ -57,12 +61,13 @@ specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2/; add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_4x4 neon/; add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const 
uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_4x4 neon/; add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d153_predictor_4x4 ssse3/; +specialize qw/vpx_d153_predictor_4x4 neon ssse3/; add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_4x4 neon msa sse2/; @@ -86,7 +91,7 @@ add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, co specialize qw/vpx_dc_128_predictor_4x4 msa neon sse2/; add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_8x8 ssse3/; +specialize qw/vpx_d207_predictor_8x8 neon ssse3/; add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; # TODO(crbug.com/webm/1522): Re-enable vsx implementation. @@ -94,19 +99,20 @@ specialize qw/vpx_d45_predictor_8x8 neon sse2/; add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; # TODO(crbug.com/webm/1522): Re-enable vsx implementation. -specialize qw/vpx_d63_predictor_8x8 ssse3/; +specialize qw/vpx_d63_predictor_8x8 neon ssse3/; add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; # TODO(crbug.com/webm/1522): Re-enable vsx implementation. specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/; add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_8x8 neon/; add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_8x8 neon/; add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d153_predictor_8x8 ssse3/; +specialize qw/vpx_d153_predictor_8x8 neon ssse3/; add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_8x8 neon msa sse2/; @@ -129,24 +135,25 @@ add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, co specialize qw/vpx_dc_128_predictor_8x8 neon msa sse2/; add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_16x16 ssse3/; +specialize qw/vpx_d207_predictor_16x16 neon ssse3/; add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/; add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_16x16 ssse3 vsx/; +specialize qw/vpx_d63_predictor_16x16 neon ssse3 vsx/; add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_16x16 neon/; add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t 
*left"; specialize qw/vpx_d135_predictor_16x16 neon/; add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d153_predictor_16x16 ssse3/; +specialize qw/vpx_d153_predictor_16x16 neon ssse3/; add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/; @@ -167,24 +174,25 @@ add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/; add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_32x32 ssse3/; +specialize qw/vpx_d207_predictor_32x32 neon ssse3/; add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/; add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_32x32 ssse3 vsx/; +specialize qw/vpx_d63_predictor_32x32 neon ssse3 vsx/; add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_32x32 neon/; add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_32x32 neon/; add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d153_predictor_32x32 ssse3/; +specialize qw/vpx_d153_predictor_32x32 neon ssse3/; add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/; @@ -207,25 +215,25 @@ specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/; # High bitdepth functions if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d207_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d63_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d117_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const 
uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d153_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d153_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/; @@ -246,25 +254,25 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d207_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d63_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d117_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d153_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d153_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/; @@ -285,25 +293,25 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d207_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d63_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t 
*above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d117_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d153_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d153_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/; @@ -324,25 +332,25 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d207_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d63_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d117_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d153_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d153_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/; @@ -374,22 +382,22 @@ add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi lsx/; add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_horiz sse2 ssse3 
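The add_proto/specialize pairs in these hunks feed libvpx's run-time CPU detection (RTCD): add_proto declares the C prototype for a DSP function, and specialize lists the SIMD flavors that implement it. Configure-time tooling turns each pair into a function pointer that an init routine retargets to the best kernel the host CPU supports. A minimal sketch of that dispatch pattern, with stub kernels and an assumed flag value standing in for the generated header:

#include <stddef.h>
#include <stdint.h>

#define HAS_NEON (1 << 0) /* assumed value; the real flag lives in vpx_ports */

typedef void (*predictor_fn)(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left);

/* Stub kernels standing in for the real C and NEON implementations. */
static void d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  (void)dst; (void)stride; (void)above; (void)left;
}
static void d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)dst; (void)stride; (void)above; (void)left;
}

/* The generated header exposes a function pointer; callers never branch. */
static predictor_fn vpx_d207_predictor_8x8 = d207_predictor_8x8_c;

static void rtcd_init(int cpu_caps) {
  if (cpu_caps & HAS_NEON) vpx_d207_predictor_8x8 = d207_predictor_8x8_neon;
}

Adding "neon" to a specialize line, as these hunks do for the d63/d117/d153/d207 predictors, is all it takes for the dispatcher to prefer the new kernel on Arm.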
avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_scaled_2d ssse3 neon msa/; @@ -589,7 +597,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct8x8_1 sse2 neon msa/; add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct16x16 neon sse2 msa lsx/; + specialize qw/vpx_fdct16x16 neon sse2 avx2 msa lsx/; add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct16x16_1 sse2 neon msa/; @@ -633,12 +641,12 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { specialize qw/vpx_idct8x8_64_add neon sse2 vsx/; specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/; specialize qw/vpx_idct8x8_1_add neon sse2/; - specialize qw/vpx_idct16x16_256_add neon sse2 vsx/; + specialize qw/vpx_idct16x16_256_add neon sse2 avx2 vsx/; specialize qw/vpx_idct16x16_38_add neon sse2/; specialize qw/vpx_idct16x16_10_add neon sse2/; specialize qw/vpx_idct16x16_1_add neon sse2/; - specialize qw/vpx_idct32x32_1024_add neon sse2 vsx/; - specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/; + specialize qw/vpx_idct32x32_1024_add neon sse2 avx2 vsx/; + specialize qw/vpx_idct32x32_135_add neon sse2 ssse3 avx2/; specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_1_add neon sse2/; specialize qw/vpx_iwht4x4_16_add sse2 vsx/; @@ -714,17 +722,17 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Quantization # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { - add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const 
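neon_dotprod and neon_i8mm are new specialization tiers for the Armv8.2 dot-product (UDOT/SDOT) and Armv8.6 I8MM (USDOT) extensions; unlike baseline NEON they are optional, so the cpudetect rework referenced earlier in this patch probes for them at run time. A sketch of the Linux/AArch64 probe, assuming the kernel exports the standard HWCAP bits (other OSes need different plumbing):

#include <sys/auxv.h>   /* getauxval */
#include <asm/hwcap.h>  /* HWCAP_ASIMDDP, HWCAP2_I8MM (Linux/AArch64 only) */

/* Returns a capability mask in the spirit of vpx_ports' arm CPU caps;
   the bit positions here are illustrative. */
static int arm_cpu_caps_model(void) {
  int caps = 0;
  if (getauxval(AT_HWCAP) & HWCAP_ASIMDDP) caps |= 1 << 1; /* dotprod */
  if (getauxval(AT_HWCAP2) & HWCAP2_I8MM) caps |= 1 << 2;  /* i8mm */
  return caps;
}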
int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER @@ -736,32 +744,35 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") { add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/; +add_proto qw/int64_t/, "vpx_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height"; +specialize qw/vpx_sse sse4_1 avx2 neon neon_dotprod/; + # # Single block SAD # add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad64x64 neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const 
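The quantizer prototypes above collapse four per-plane coefficient arrays (zbin, round, quant, quant_shift) into a single macroblock_plane pointer, and the scan/iscan pair into a ScanOrder; the 32x32 variants also drop n_coeffs, which is implied by the block size. A sketch of the shape of the new interface; the struct fields are assumptions modeled on the VP9 encoder structs, not copies of them:

#include <stdint.h>

typedef int32_t tran_low_t; /* int16_t in non-high-bitdepth builds */

/* Assumed layout: the real structs live in the VP9 encoder headers. */
struct macroblock_plane {
  int16_t *zbin, *round, *quant, *quant_shift;
};
struct ScanOrder {
  const int16_t *scan, *iscan;
};

void vpx_quantize_b(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                    const struct macroblock_plane *const mb_plane,
                    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                    const int16_t *dequant_ptr, uint16_t *eob_ptr,
                    const struct ScanOrder *const scan_order);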
uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x32 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad64x32 neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x64 neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad32x32 neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x16 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x16 neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x32 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x32 neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad16x16 neon neon_dotprod msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x8 neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad8x16 neon msa sse2 vsx mmi/; @@ -778,6 +789,45 @@ specialize qw/vpx_sad4x8 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad4x4 neon msa sse2 mmi/; +add_proto qw/unsigned int vpx_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_64x64 neon neon_dotprod avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_64x32 neon neon_dotprod avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x64 neon neon_dotprod avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x32 neon neon_dotprod avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x16 neon neon_dotprod avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x32 neon neon_dotprod sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x16 neon neon_dotprod sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize 
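The neon_dotprod SAD tier added throughout this block rests on one idea: per-byte absolute differences can be summed four-at-a-time into 32-bit lanes with a single UDOT against a vector of ones, replacing the widen-and-pairwise-add chain that plain NEON needs. A sketch of the row kernel (compile with dot-product support, e.g. -march=armv8.2-a+dotprod):

#include <arm_neon.h>
#include <stdint.h>

/* SAD of one 16-byte row: per-byte |s - r|, then UDOT with all-ones
   accumulates groups of four bytes into each 32-bit lane of acc.
   A final vaddvq_u32(acc) reduces the four lanes to the SAD total. */
static inline uint32x4_t sad16_row_dotprod(uint32x4_t acc, const uint8_t *src,
                                           const uint8_t *ref) {
  const uint8x16_t s = vld1q_u8(src);
  const uint8x16_t r = vld1q_u8(ref);
  const uint8x16_t diff = vabdq_u8(s, r);     /* per-byte |s - r| */
  return vdotq_u32(acc, diff, vdupq_n_u8(1)); /* acc += dot(diff, 1) */
}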
qw/vpx_sad_skip_16x8 neon neon_dotprod sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x16 neon sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x8 neon sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x4 neon/; + +add_proto qw/unsigned int vpx_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_4x8 neon sse2/; + +add_proto qw/unsigned int vpx_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_4x4 neon/; + # # Avg # @@ -802,19 +852,19 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_highbd_hadamard_8x8 avx2/; + specialize qw/vpx_highbd_hadamard_8x8 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_highbd_hadamard_16x16 avx2/; + specialize qw/vpx_highbd_hadamard_16x16 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_highbd_hadamard_32x32 avx2/; + specialize qw/vpx_highbd_hadamard_32x32 avx2 neon/; add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length"; specialize qw/vpx_satd avx2 sse2 neon/; add_proto qw/int vpx_highbd_satd/, "const tran_low_t *coeff, int length"; - specialize qw/vpx_highbd_satd avx2/; + specialize qw/vpx_highbd_satd avx2 neon/; } else { add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx lsx/, "$ssse3_x86_64"; @@ -830,38 +880,37 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { } add_proto qw/void vpx_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height"; - specialize qw/vpx_int_pro_row sse2 neon msa/; - + specialize qw/vpx_int_pro_row neon sse2 msa/; add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width"; - specialize qw/vpx_int_pro_col sse2 neon msa/; + specialize qw/vpx_int_pro_col neon sse2 msa/; add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl"; specialize qw/vpx_vector_var neon sse2 msa/; } # CONFIG_VP9_ENCODER add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad64x64_avg neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad64x32_avg neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad64x32_avg neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x64_avg neon avx2 msa sse2 vsx 
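The new vpx_sad_skip_* family is the downsampled SAD used by faster motion-search presets: it reads only every other row, then doubles the result so scores remain comparable with full SADs. A scalar model of the whole family, with width and height as parameters instead of per-size entry points:

#include <stdint.h>
#include <stdlib.h>

static unsigned int sad_skip_model(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height / 2; ++r) { /* even rows only */
    for (c = 0; c < width; ++c) sad += (unsigned)abs(src[c] - ref[c]);
    src += 2 * src_stride;
    ref += 2 * ref_stride;
  }
  return 2 * sad; /* compensate for the skipped rows */
}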
mmi/; +specialize qw/vpx_sad32x64_avg neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad32x32_avg neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x16_avg neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x16_avg neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x32_avg neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x32_avg neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x16_avg neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x16_avg neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x8_avg neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x8_avg neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_sad8x16_avg neon msa sse2 mmi/; @@ -881,45 +930,84 @@ specialize qw/vpx_sad4x4_avg neon msa sse2 mmi/; # # Multi-block SAD, comparing a reference to N independent blocks # -add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi lsx/; +add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad64x64x4d avx512 avx2 neon neon_dotprod msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi lsx/; +add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad64x32x4d neon neon_dotprod msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi lsx/; +add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad32x64x4d neon neon_dotprod msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi lsx/; +add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t 
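The *_avg variants score a compound prediction: the reference block is first averaged with second_pred, rounding up exactly as vpx_comp_avg_pred does, and the SAD is taken against that average. A scalar model:

#include <stdint.h>
#include <stdlib.h>

static unsigned int sad_avg_model(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  const uint8_t *second_pred,
                                  int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      const int avg = (ref[c] + second_pred[c] + 1) >> 1; /* rounding average */
      sad += (unsigned)abs(src[c] - avg);
    }
    src += src_stride;
    ref += ref_stride;
    second_pred += width; /* second_pred is packed at the block width */
  }
  return sad;
}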
sad_array[4]"; +specialize qw/vpx_sad32x32x4d avx2 neon neon_dotprod msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/; +add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad32x16x4d neon neon_dotprod msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/; +add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad16x32x4d neon neon_dotprod msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi lsx/; +add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad16x16x4d neon neon_dotprod msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/; +add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad16x8x4d neon neon_dotprod msa sse2 vsx mmi/; -add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x8x4d neon msa sse2 mmi lsx/; -add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize 
qw/vpx_sad4x4x4d neon msa sse2 mmi/; +add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_64x64x4d neon neon_dotprod avx2 sse2/; + +add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_64x32x4d neon neon_dotprod avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x64x4d neon neon_dotprod avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x32x4d neon neon_dotprod avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x16x4d neon neon_dotprod avx2 sse2/; + +add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x32x4d neon neon_dotprod sse2/; + +add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x16x4d neon neon_dotprod sse2/; + +add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x8x4d neon neon_dotprod sse2/; + +add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x16x4d neon sse2/; + +add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x8x4d neon sse2/; + +add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x4x4d neon/; + +add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_4x8x4d neon sse2/; + +add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_4x4x4d neon/; + add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/; @@ -941,6 +1029,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd"; specialize qw/vpx_highbd_subtract_block neon avx2/; + add_proto qw/int64_t/, "vpx_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height"; + specialize qw/vpx_highbd_sse sse4_1 avx2 neon/; + # # Single block 
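The x4d variants evaluate four candidate references per call so that SIMD versions can amortize every source load across four accumulators, which is also where the neon_dotprod tier pays off most. A scalar model of the data flow:

#include <stdint.h>
#include <stdlib.h>

static void sad_x4d_model(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t out[4], int width, int height) {
  const uint8_t *r0 = ref[0], *r1 = ref[1], *r2 = ref[2], *r3 = ref[3];
  int r, c;
  out[0] = out[1] = out[2] = out[3] = 0;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      const int s = src[c]; /* one source load feeds all four SADs */
      out[0] += (uint32_t)abs(s - r0[c]);
      out[1] += (uint32_t)abs(s - r1[c]);
      out[2] += (uint32_t)abs(s - r2[c]);
      out[3] += (uint32_t)abs(s - r3[c]);
    }
    src += src_stride;
    r0 += ref_stride; r1 += ref_stride; r2 += ref_stride; r3 += ref_stride;
  }
}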
SAD # @@ -983,16 +1074,56 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_highbd_sad4x4 neon/; + add_proto qw/unsigned int vpx_highbd_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_64x64 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_64x32 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x64 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x32 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x16 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x32 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x16 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x8 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x16 neon sse2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x8 neon sse2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x4 neon/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_4x8 neon/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_4x4 neon/; + # # Avg # add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p"; - specialize qw/vpx_highbd_avg_8x8 sse2/; + specialize qw/vpx_highbd_avg_8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p"; - specialize qw/vpx_highbd_avg_4x4 sse2/; + specialize qw/vpx_highbd_avg_4x4 sse2 neon/; add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max"; + specialize qw/vpx_highbd_minmax_8x8 neon/; add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_highbd_sad64x64_avg sse2 neon avx2/; @@ -1036,45 +1167,84 @@ if 
(vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Multi-block SAD, comparing a reference to N independent blocks # - add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad64x64x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad64x32x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x32x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x16x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x16x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x8x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x16x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t 
*src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x8x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x4x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x8x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x4x4d sse2 neon/; + add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_64x64x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_64x32x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x64x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x32x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x16x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x32x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x16x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x8x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x16x4d neon sse2/; + + add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int 
src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x8x4d neon sse2/; + + add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x4x4d neon/; + + add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_4x8x4d neon sse2/; + + add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_4x4x4d neon/; + # # Structured Similarity (SSIM) # @@ -1090,73 +1260,73 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq " # Variance # add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance64x64 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance64x32 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance32x64 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance32x32 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance32x16 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance16x32 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance16x16 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance16x8 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x16 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance8x16 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto 
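All of these vpx_highbd_* prototypes keep uint8_t pointers even though the samples are 16-bit: callers encode the uint16_t buffer address and the kernel decodes it, mirroring the shift-based macros in vpx_dsp/vpx_dsp_common.h. A model of a high-bitdepth SAD built on that convention:

#include <stdint.h>
#include <stdlib.h>

/* Mirrors the libvpx convention for smuggling uint16_t buffers through
   uint8_t prototypes. */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

static unsigned int highbd_sad4x4_model(const uint8_t *src8, int src_stride,
                                        const uint8_t *ref8, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8); /* decode real pointer */
  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) sad += (unsigned)abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}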
qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance8x8 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x4 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance8x4 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance4x8 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance4x8 sse2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance4x4 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance4x4 sse2 neon neon_dotprod msa mmi vsx/; # # Specialty Variance # add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx lsx/; + specialize qw/vpx_get16x16var sse2 avx2 neon neon_dotprod msa vsx lsx/; add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_get8x8var sse2 neon msa vsx/; + specialize qw/vpx_get8x8var sse2 neon neon_dotprod msa vsx/; add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_mse16x16 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse16x8 sse2 avx2 msa mmi vsx/; + specialize qw/vpx_mse16x8 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse8x16 sse2 msa mmi vsx/; + specialize qw/vpx_mse8x16 sse2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse8x8 sse2 msa mmi vsx/; + specialize qw/vpx_mse8x8 sse2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; specialize qw/vpx_get_mb_ss sse2 msa vsx/; add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride"; - specialize qw/vpx_get4x4sse_cs neon msa vsx/; + specialize qw/vpx_get4x4sse_cs neon neon_dotprod msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; - specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/; + specialize qw/vpx_comp_avg_pred neon sse2 avx2 vsx lsx/; # # Subpixel Variance diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c index b2e01319d..61e4e73c5 100644 --- a/vpx_dsp/x86/avg_intrin_avx2.c +++ b/vpx_dsp/x86/avg_intrin_avx2.c @@ -218,6 +218,14 @@ void 
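Every variance kernel accumulates both the sum of differences and the sum of squared differences in one pass and returns sse minus sum squared over N; the neon_dotprod versions added here map the squared-difference accumulation onto the dot-product instructions. A scalar model:

#include <stdint.h>

/* One-pass variance: var = SSE - sum^2 / N. */
static unsigned int variance_model(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int w, int h, unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sq = 0;
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int d = src[c] - ref[c];
      sum += d;
      sq += (uint64_t)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = (unsigned int)sq;
  return (unsigned int)(sq - (uint64_t)((sum * sum) / (w * h)));
}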
vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff, } #endif // CONFIG_VP9_HIGHBITDEPTH +static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero, + __m256i *out_lo, + __m256i *out_hi) { + const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in); + *out_lo = _mm256_unpacklo_epi16(in, sign_bits); + *out_hi = _mm256_unpackhi_epi16(in, sign_bits); +} + static void hadamard_col8x2_avx2(__m256i *in, int iter) { __m256i a0 = in[0]; __m256i a1 = in[1]; @@ -400,6 +408,12 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *t_coeff = coeff; #endif int idx; + __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m256i b0, b1, b2, b3; + const __m256i zero = _mm256_setzero_si256(); for (idx = 0; idx < 4; ++idx) { // src_diff: 9 bit, dynamic range [-255, 255] const int16_t *src_ptr = @@ -414,15 +428,38 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); - __m256i b0 = _mm256_add_epi16(coeff0, coeff1); - __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); - __m256i b2 = _mm256_add_epi16(coeff2, coeff3); - __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + // Sign extend 16 bit to 32 bit. + sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm256_srai_epi32(b0_lo, 2); + b1_lo = _mm256_srai_epi32(b1_lo, 2); + b2_lo = _mm256_srai_epi32(b2_lo, 2); + b3_lo = _mm256_srai_epi32(b3_lo, 2); + + b0_hi = _mm256_srai_epi32(b0_hi, 2); + b1_hi = _mm256_srai_epi32(b1_hi, 2); + b2_hi = _mm256_srai_epi32(b2_hi, 2); + b3_hi = _mm256_srai_epi32(b3_hi, 2); - b0 = _mm256_srai_epi16(b0, 2); - b1 = _mm256_srai_epi16(b1, 2); - b2 = _mm256_srai_epi16(b2, 2); - b3 = _mm256_srai_epi16(b3, 2); + b0 = _mm256_packs_epi32(b0_lo, b0_hi); + b1 = _mm256_packs_epi32(b1_lo, b1_hi); + b2 = _mm256_packs_epi32(b2_lo, b2_hi); + b3 = _mm256_packs_epi32(b3_lo, b3_hi); store_tran_low(_mm256_add_epi16(b0, b2), coeff); store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256); diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index 015c11a1f..4447dfab7 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -15,6 +15,14 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_ports/mem.h" +static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero, + __m128i *out_lo, + __m128i *out_hi) { + const __m128i sign_bits = _mm_cmplt_epi16(in, zero); + *out_lo = _mm_unpacklo_epi16(in, sign_bits); + *out_hi = _mm_unpackhi_epi16(in, sign_bits); +} + void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max) { __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, 
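The avg_intrin changes fix a 16-bit overflow in the 32x32 Hadamard: after the 16x16 stages the coefficients can already sit near the int16_t limits, so forming coeff0 + coeff1 with a 16-bit add could wrap before the >> 2 normalization. The rewritten code sign-extends to 32 bits, adds and shifts there, and saturates back with packs_epi32. The scalar equivalent of one combine step (the +/-32640 bound on the stage inputs is an assumption for illustration):

#include <stdint.h>

static int16_t hadamard32_combine_step(int16_t coeff0, int16_t coeff1) {
  const int32_t b = (int32_t)coeff0 + coeff1; /* widen before adding */
  const int32_t shifted = b >> 2;             /* 32-bit arithmetic shift */
  /* saturating pack back to 16 bits, as _mm256_packs_epi32 does */
  if (shifted > INT16_MAX) return INT16_MAX;
  if (shifted < INT16_MIN) return INT16_MIN;
  return (int16_t)shifted;
}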
negdiff, absdiff0, absdiff; @@ -400,6 +408,12 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *t_coeff = coeff; #endif int idx; + __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m128i b0, b1, b2, b3; + const __m128i zero = _mm_setzero_si128(); for (idx = 0; idx < 4; ++idx) { const int16_t *src_ptr = src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; @@ -413,15 +427,38 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512)); __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768)); - __m128i b0 = _mm_add_epi16(coeff0, coeff1); - __m128i b1 = _mm_sub_epi16(coeff0, coeff1); - __m128i b2 = _mm_add_epi16(coeff2, coeff3); - __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + // Sign extend 16 bit to 32 bit. + sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm_srai_epi32(b0_lo, 2); + b1_lo = _mm_srai_epi32(b1_lo, 2); + b2_lo = _mm_srai_epi32(b2_lo, 2); + b3_lo = _mm_srai_epi32(b3_lo, 2); + + b0_hi = _mm_srai_epi32(b0_hi, 2); + b1_hi = _mm_srai_epi32(b1_hi, 2); + b2_hi = _mm_srai_epi32(b2_hi, 2); + b3_hi = _mm_srai_epi32(b3_hi, 2); - b0 = _mm_srai_epi16(b0, 2); - b1 = _mm_srai_epi16(b1, 2); - b2 = _mm_srai_epi16(b2, 2); - b3 = _mm_srai_epi16(b3, 2); + b0 = _mm_packs_epi32(b0_lo, b0_hi); + b1 = _mm_packs_epi32(b1_lo, b1_hi); + b2 = _mm_packs_epi32(b2_lo, b2_hi); + b3 = _mm_packs_epi32(b3_lo, b3_hi); coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); diff --git a/vpx_dsp/x86/avg_pred_avx2.c b/vpx_dsp/x86/avg_pred_avx2.c new file mode 100644 index 000000000..f4357998c --- /dev/null +++ b/vpx_dsp/x86/avg_pred_avx2.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" + +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int row = 0; + // comp_pred and pred must be 32 byte aligned. 
+ assert(((intptr_t)comp_pred % 32) == 0); + assert(((intptr_t)pred % 32) == 0); + + if (width == 8) { + assert(height % 4 == 0); + do { + const __m256i p = _mm256_load_si256((const __m256i *)pred); + const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref); + const __m128i r_1 = + _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride)); + + const __m128i r1 = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(r_0), (const __m64 *)(ref + ref_stride))); + const __m128i r2 = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(r_1), (const __m64 *)(ref + 3 * ref_stride))); + + const __m256i ref_0123 = + _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1); + const __m256i avg = _mm256_avg_epu8(p, ref_0123); + + _mm256_store_si256((__m256i *)comp_pred, avg); + + row += 4; + pred += 32; + comp_pred += 32; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 16) { + assert(height % 4 == 0); + do { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred); + const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32)); + const __m256i tmp0 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)ref)); + const __m256i ref_0 = _mm256_inserti128_si256( + tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1); + const __m256i tmp1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride))); + const __m256i ref_1 = _mm256_inserti128_si256( + tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)comp_pred, average_0); + _mm256_store_si256((__m256i *)(comp_pred + 32), average_1); + + row += 4; + pred += 64; + comp_pred += 64; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 32) { + assert(height % 2 == 0); + do { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred); + const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)comp_pred, average_0); + _mm256_store_si256((__m256i *)(comp_pred + 32), average_1); + + row += 2; + pred += 64; + comp_pred += 64; + ref += 2 * ref_stride; + } while (row < height); + } else if (width % 64 == 0) { + do { + int x; + for (x = 0; x < width; x += 64) { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)(pred + x)); + const __m256i pred_1 = + _mm256_load_si256((const __m256i *)(pred + x + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x)); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + x + 32)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)(comp_pred + x), average_0); + _mm256_store_si256((__m256i *)(comp_pred + x + 32), average_1); + } + row++; + pred += width; + comp_pred += width; + ref += ref_stride; + } while (row < height); + } else { + vpx_comp_avg_pred_sse2(comp_pred, pred, width, height, ref, ref_stride); + } +} diff --git a/vpx_dsp/x86/fwd_txfm_avx2.c b/vpx_dsp/x86/fwd_txfm_avx2.c index a2ed420e3..c8f54a49c 100644 --- a/vpx_dsp/x86/fwd_txfm_avx2.c +++ b/vpx_dsp/x86/fwd_txfm_avx2.c @@ -8,9 +8,382 @@ * be found in 
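vpx_comp_avg_pred_avx2 is a pure rounding average; the width-specialized branches exist only so that every store is a full, aligned 32-byte write (pred and comp_pred are packed at the block width, so four 8-wide rows or two 32-byte rows fill whole __m256i registers). A scalar model of the exact arithmetic, which _mm256_avg_epu8 performs 32 bytes at a time:

#include <stdint.h>

static void comp_avg_pred_model(uint8_t *comp_pred, const uint8_t *pred,
                                int width, int height, const uint8_t *ref,
                                int ref_stride) {
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c)
      comp_pred[c] = (uint8_t)((pred[c] + ref[c] + 1) >> 1); /* round up */
    comp_pred += width; /* packed at block width */
    pred += width;
    ref += ref_stride;
  }
}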
the AUTHORS file in the root of the source tree. */ +#include <immintrin.h> // AVX2 #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#define ADD256_EPI16 _mm256_add_epi16 +#define SUB256_EPI16 _mm256_sub_epi16 + +static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, + int stride, __m256i *out, + int out_size, int pass) { + int i; + const __m256i kOne = _mm256_set1_epi16(1); + if (pass == 0) { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride)); + // x = x << 2 + out[i] = _mm256_slli_epi16(out[i], 2); + } + } else { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * 16)); + // x = (x + 1) >> 2 + out[i] = _mm256_add_epi16(out[i], kOne); + out[i] = _mm256_srai_epi16(out[i], 2); + } + } +} + +static INLINE void transpose2_8x8_avx2(const __m256i *const in, + __m256i *const out) { + int i; + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1); + +#define LOADR(idx) \ + t[8 + idx] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + idx] = _mm256_inserti128_si256( \ + t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +// Store 8 16-bit values. Sign extend the values. 
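+// (Illustrative scalar model of the helper below, not part of the patch,
+// assuming the 16-bit tran_low_t build: each iteration stores one full
+// 256-bit row, i.e. 16 lanes, then advances the destination by `stride`
+// elements:
+//
+//   for (i = 0; i < out_size; ++i) {
+//     for (j = 0; j < 16; ++j) out[j] = (tran_low_t)lane16(in[i], j);
+//     out += stride;
+//   }
+//
+// where lane16() is a hypothetical accessor for lane j of a 256-bit
+// register; the cast is where sign extension would occur for a wider
+// tran_low_t.)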
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, + tran_low_t *out, + const int stride, + const int out_size) { + int i; + for (i = 0; i < out_size; ++i) { + _mm256_storeu_si256((__m256i *)(out), in[i]); + out += stride; + } +} + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE __m256i mult256_round_shift(const __m256i *pin0, + const __m256i *pin1, + const __m256i *pmultiplier, + const __m256i *prounding, + const int shift) { + const __m256i u0 = _mm256_madd_epi16(*pin0, *pmultiplier); + const __m256i u1 = _mm256_madd_epi16(*pin1, *pmultiplier); + const __m256i v0 = _mm256_add_epi32(u0, *prounding); + const __m256i v1 = _mm256_add_epi32(u1, *prounding); + const __m256i w0 = _mm256_srai_epi32(v0, shift); + const __m256i w1 = _mm256_srai_epi32(v1, shift); + return _mm256_packs_epi32(w0, w1); +} + +static INLINE void fdct16x16_1D_avx2(__m256i *input, __m256i *output) { + int i; + __m256i step2[4]; + __m256i in[8]; + __m256i step1[8]; + __m256i step3[8]; + + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64); + const __m256i k__cospi_p16_m16 = PAIR256_SET_EPI16(cospi_16_64, -cospi_16_64); + const __m256i k__cospi_p24_p08 = PAIR256_SET_EPI16(cospi_24_64, cospi_8_64); + const __m256i k__cospi_p08_m24 = PAIR256_SET_EPI16(cospi_8_64, -cospi_24_64); + const __m256i k__cospi_m08_p24 = PAIR256_SET_EPI16(-cospi_8_64, cospi_24_64); + const __m256i k__cospi_p28_p04 = PAIR256_SET_EPI16(cospi_28_64, cospi_4_64); + const __m256i k__cospi_m04_p28 = PAIR256_SET_EPI16(-cospi_4_64, cospi_28_64); + const __m256i k__cospi_p12_p20 = PAIR256_SET_EPI16(cospi_12_64, cospi_20_64); + const __m256i k__cospi_m20_p12 = PAIR256_SET_EPI16(-cospi_20_64, cospi_12_64); + const __m256i k__cospi_p30_p02 = PAIR256_SET_EPI16(cospi_30_64, cospi_2_64); + const __m256i k__cospi_p14_p18 = PAIR256_SET_EPI16(cospi_14_64, cospi_18_64); + const __m256i k__cospi_m02_p30 = PAIR256_SET_EPI16(-cospi_2_64, cospi_30_64); + const __m256i k__cospi_m18_p14 = PAIR256_SET_EPI16(-cospi_18_64, cospi_14_64); + const __m256i k__cospi_p22_p10 = PAIR256_SET_EPI16(cospi_22_64, cospi_10_64); + const __m256i k__cospi_p06_p26 = PAIR256_SET_EPI16(cospi_6_64, cospi_26_64); + const __m256i k__cospi_m10_p22 = PAIR256_SET_EPI16(-cospi_10_64, cospi_22_64); + const __m256i k__cospi_m26_p06 = PAIR256_SET_EPI16(-cospi_26_64, cospi_6_64); + const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); + + // Calculate input for the first 8 results. + for (i = 0; i < 8; i++) { + in[i] = ADD256_EPI16(input[i], input[15 - i]); + } + + // Calculate input for the next 8 results. 
+ for (i = 0; i < 8; i++) { + step1[i] = SUB256_EPI16(input[7 - i], input[8 + i]); + } + + // Work on the first eight values; fdct8(input, even_results); + { + // Add/subtract + const __m256i q0 = ADD256_EPI16(in[0], in[7]); + const __m256i q1 = ADD256_EPI16(in[1], in[6]); + const __m256i q2 = ADD256_EPI16(in[2], in[5]); + const __m256i q3 = ADD256_EPI16(in[3], in[4]); + const __m256i q4 = SUB256_EPI16(in[3], in[4]); + const __m256i q5 = SUB256_EPI16(in[2], in[5]); + const __m256i q6 = SUB256_EPI16(in[1], in[6]); + const __m256i q7 = SUB256_EPI16(in[0], in[7]); + + // Work on first four results + { + // Add/subtract + const __m256i r0 = ADD256_EPI16(q0, q3); + const __m256i r1 = ADD256_EPI16(q1, q2); + const __m256i r2 = SUB256_EPI16(q1, q2); + const __m256i r3 = SUB256_EPI16(q0, q3); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m256i t0 = _mm256_unpacklo_epi16(r0, r1); + const __m256i t1 = _mm256_unpackhi_epi16(r0, r1); + const __m256i t2 = _mm256_unpacklo_epi16(r2, r3); + const __m256i t3 = _mm256_unpackhi_epi16(r2, r3); + + output[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[8] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[4] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[12] = + mult256_round_shift(&t2, &t3, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + + // Work on next four results + { + // Interleave to do the multiply by constants which gets us + // into 32 bits. + const __m256i d0 = _mm256_unpacklo_epi16(q6, q5); + const __m256i d1 = _mm256_unpackhi_epi16(q6, q5); + const __m256i r0 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_m16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + const __m256i r1 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + + { + // Add/subtract + const __m256i x0 = ADD256_EPI16(q4, r0); + const __m256i x1 = SUB256_EPI16(q4, r0); + const __m256i x2 = SUB256_EPI16(q7, r1); + const __m256i x3 = ADD256_EPI16(q7, r1); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. 
+ { + const __m256i t0 = _mm256_unpacklo_epi16(x0, x3); + const __m256i t1 = _mm256_unpackhi_epi16(x0, x3); + const __m256i t2 = _mm256_unpacklo_epi16(x1, x2); + const __m256i t3 = _mm256_unpackhi_epi16(x1, x2); + output[2] = + mult256_round_shift(&t0, &t1, &k__cospi_p28_p04, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[14] = + mult256_round_shift(&t0, &t1, &k__cospi_m04_p28, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[10] = + mult256_round_shift(&t2, &t3, &k__cospi_p12_p20, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[6] = + mult256_round_shift(&t2, &t3, &k__cospi_m20_p12, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + } + } + // Work on the next eight values; step1 -> odd_results + { // step 2 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[5], step1[2]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[5], step1[2]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[4], step1[3]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[4], step1[3]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 3 + { + step3[0] = ADD256_EPI16(step1[0], step2[1]); + step3[1] = ADD256_EPI16(step1[1], step2[0]); + step3[2] = SUB256_EPI16(step1[1], step2[0]); + step3[3] = SUB256_EPI16(step1[0], step2[1]); + step3[4] = SUB256_EPI16(step1[7], step2[3]); + step3[5] = SUB256_EPI16(step1[6], step2[2]); + step3[6] = ADD256_EPI16(step1[6], step2[2]); + step3[7] = ADD256_EPI16(step1[7], step2[3]); + } + // step 4 + { + const __m256i t0 = _mm256_unpacklo_epi16(step3[1], step3[6]); + const __m256i t1 = _mm256_unpackhi_epi16(step3[1], step3[6]); + const __m256i t2 = _mm256_unpacklo_epi16(step3[2], step3[5]); + const __m256i t3 = _mm256_unpackhi_epi16(step3[2], step3[5]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p08_m24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 5 + { + step1[0] = ADD256_EPI16(step3[0], step2[0]); + step1[1] = SUB256_EPI16(step3[0], step2[0]); + step1[2] = ADD256_EPI16(step3[3], step2[1]); + step1[3] = SUB256_EPI16(step3[3], step2[1]); + step1[4] = SUB256_EPI16(step3[4], step2[3]); + step1[5] = ADD256_EPI16(step3[4], step2[3]); + step1[6] = SUB256_EPI16(step3[7], step2[2]); + step1[7] = ADD256_EPI16(step3[7], step2[2]); + } + // step 6 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[0], step1[7]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[0], step1[7]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[1], step1[6]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[1], step1[6]); + output[1] = mult256_round_shift(&t0, &t1, &k__cospi_p30_p02, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[9] = mult256_round_shift(&t2, &t3, &k__cospi_p14_p18, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[15] = mult256_round_shift(&t0, &t1, &k__cospi_m02_p30, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[7] = mult256_round_shift(&t2, &t3, 
&k__cospi_m18_p14, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[2], step1[5]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[2], step1[5]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[3], step1[4]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[3], step1[4]); + output[5] = mult256_round_shift(&t0, &t1, &k__cospi_p22_p10, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[13] = mult256_round_shift(&t2, &t3, &k__cospi_p06_p26, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[11] = mult256_round_shift(&t0, &t1, &k__cospi_m10_p22, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[3] = mult256_round_shift(&t2, &t3, &k__cospi_m26_p06, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } +} + +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride) { + int pass; + DECLARE_ALIGNED(32, int16_t, intermediate[256]); + int16_t *out0 = intermediate; + tran_low_t *out1 = output; + const int width = 16; + const int height = 16; + __m256i buf0[16], buf1[16]; + + // Two transform and transpose passes + // Process 16 columns (transposed rows in second pass) at a time. + for (pass = 0; pass < 2; ++pass) { + // Load and pre-condition input. + load_buffer_16bit_to_16bit_avx2(input, stride, buf1, height, pass); + + // Calculate dct for 16x16 values + fdct16x16_1D_avx2(buf1, buf0); + + // Transpose the results. + transpose_16bit_16x16_avx2(buf0, buf1); + + if (pass == 0) { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out0, width, height); + } else { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out1, width, height); + } + // Setup in/out for next pass. + input = intermediate; + } +} + #if !CONFIG_VP9_HIGHBITDEPTH #define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index 8edddd637..35ca55404 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,6 +11,8 @@ #include <immintrin.h> #include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i sign = _mm_srai_epi16(*p, 15); @@ -26,17 +28,15 @@ static VPX_FORCE_INLINE void update_qp(__m256i *qp) { } } -static VPX_FORCE_INLINE void init_qp(const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const int16_t *quant_shift_ptr, - __m256i *qp, int log_scale) { - const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr); - const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); - const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); +static VPX_FORCE_INLINE void init_qp( + const struct macroblock_plane *const mb_plane, const int16_t *dequant_ptr, + __m256i *qp, int log_scale) { + const __m128i zbin = _mm_loadu_si128((const __m128i *)mb_plane->zbin); + const __m128i round = _mm_loadu_si128((const __m128i *)mb_plane->round); + const __m128i quant = _mm_loadu_si128((const __m128i *)mb_plane->quant); const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); - const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr); + const __m128i quant_shift = + _mm_loadu_si128((const __m128i *)mb_plane->quant_shift); init_one_qp(&zbin, &qp[0]); init_one_qp(&round, &qp[1]); init_one_qp(&quant, &qp[2]); @@ -134,19 +134,16 @@ static VPX_FORCE_INLINE void quantize(const __m256i *qp, 
} void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int step = 8; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; - (void)scan; + const int16_t *iscan = scan_order->iscan; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0); + init_qp(mb_plane, dequant_ptr, qp, 0); quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); @@ -222,17 +219,16 @@ static VPX_FORCE_INLINE void quantize_b_32x32( } void vpx_highbd_quantize_b_32x32_avx2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { const unsigned int step = 8; + intptr_t n_coeffs = 32 * 32; + const int16_t *iscan = scan_order->iscan; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; - (void)scan; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); + init_qp(mb_plane, dequant_ptr, qp, 1); quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index ae1981a83..adae60756 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -15,19 +15,22 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" -#if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, j, non_zero_regs = (int)count / 4, eob_i = 0; __m128i zbins[2]; __m128i nzbins[2]; + const int16_t *iscan = scan_order->iscan; + const int16_t *zbin_ptr = mb_plane->zbin; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[0]); @@ -38,8 +41,6 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); - (void)scan; - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); @@ -93,19 +94,18 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void vpx_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const 
int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = 0; - const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); - const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); - (void)scan; + const intptr_t n_coeffs = 32 * 32; + const int16_t *iscan = scan_order->iscan; + const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); @@ -140,14 +140,14 @@ void vpx_highbd_quantize_b_32x32_sse2( const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15); qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
iscan[idx_arr[i]] : eob; } *eob_ptr = eob; } -#endif diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c index 947b5e977..e483fdce7 100644 --- a/vpx_dsp/x86/highbd_sad4d_avx2.c +++ b/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -61,70 +61,79 @@ static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/, } } +static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 2); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 1; + } + calc_final_4(sums_32, sad_array); +} + #define HIGHBD_SAD64XNX4D(n) \ - void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *refs[4]; \ - __m256i sums_16[4]; \ - __m256i sums_32[4]; \ - int i; \ - \ - refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ - refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ - refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ - refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ - sums_32[0] = _mm256_setzero_si256(); \ - sums_32[1] = _mm256_setzero_si256(); \ - sums_32[2] = _mm256_setzero_si256(); \ - sums_32[3] = _mm256_setzero_si256(); \ - \ - for (i = 0; i < (n / 2); ++i) { \ - sums_16[0] = _mm256_setzero_si256(); \ - sums_16[1] = _mm256_setzero_si256(); \ - sums_16[2] = _mm256_setzero_si256(); \ - sums_16[3] = _mm256_setzero_si256(); \ - \ - highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \ - \ - /* sums_16 will outrange after 2 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32[0] = _mm256_add_epi32( \ - sums_32[0], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[0], 1)))); \ - 
sums_32[1] = _mm256_add_epi32( \ - sums_32[1], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[1], 1)))); \ - sums_32[2] = _mm256_add_epi32( \ - sums_32[2], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[2], 1)))); \ - sums_32[3] = _mm256_add_epi32( \ - sums_32[3], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[3], 1)))); \ - \ - src += src_stride << 1; \ - } \ - calc_final_4(sums_32, sad_array); \ + highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ } -// 64x64 -HIGHBD_SAD64XNX4D(64) - -// 64x32 -HIGHBD_SAD64XNX4D(32) +#define HIGHBD_SADSKIP64XNx4D(n) \ + void vpx_highbd_sad_skip_64x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, @@ -171,73 +180,79 @@ static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, } } +static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 8); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); + + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 3; + } + calc_final_4(sums_32, sad_array); +} + #define HIGHBD_SAD32XNX4D(n) \ - void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, 
uint32_t sad_array[4]) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *refs[4]; \ - __m256i sums_16[4]; \ - __m256i sums_32[4]; \ - int i; \ - \ - refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ - refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ - refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ - refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ - sums_32[0] = _mm256_setzero_si256(); \ - sums_32[1] = _mm256_setzero_si256(); \ - sums_32[2] = _mm256_setzero_si256(); \ - sums_32[3] = _mm256_setzero_si256(); \ - \ - for (i = 0; i < (n / 8); ++i) { \ - sums_16[0] = _mm256_setzero_si256(); \ - sums_16[1] = _mm256_setzero_si256(); \ - sums_16[2] = _mm256_setzero_si256(); \ - sums_16[3] = _mm256_setzero_si256(); \ - \ - highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \ - \ - /* sums_16 will outrange after 8 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32[0] = _mm256_add_epi32( \ - sums_32[0], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[0], 1)))); \ - sums_32[1] = _mm256_add_epi32( \ - sums_32[1], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[1], 1)))); \ - sums_32[2] = _mm256_add_epi32( \ - sums_32[2], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[2], 1)))); \ - sums_32[3] = _mm256_add_epi32( \ - sums_32[3], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[3], 1)))); \ - \ - src += src_stride << 3; \ - } \ - calc_final_4(sums_32, sad_array); \ + highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ } -// 32x64 -HIGHBD_SAD32XNX4D(64) - -// 32x32 -HIGHBD_SAD32XNX4D(32) - -// 32x16 -HIGHBD_SAD32XNX4D(16) +#define HIGHBD_SADSKIP32XNx4D(n) \ + void vpx_highbd_sad_skip_32x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, @@ -275,13 +290,15 @@ static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, } } -void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], - int ref_stride, uint32_t sad_array[4]) { +static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); uint16_t *refs[4]; __m256i sums_16[4]; __m256i sums_32[4]; + const int height = VPXMIN(16, n); + const int num_iters = n / height; int i; refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); @@ -293,13 +310,13 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, sums_32[2] = _mm256_setzero_si256(); sums_32[3] = _mm256_setzero_si256(); - for (i = 0; i < 2; ++i) { + for (i = 0; i < num_iters; ++i) { sums_16[0] = _mm256_setzero_si256(); sums_16[1] = _mm256_setzero_si256(); sums_16[2] = _mm256_setzero_si256(); sums_16[3] = _mm256_setzero_si256(); - 
highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height); // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 sums_32[0] = _mm256_add_epi32( @@ -328,6 +345,26 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums_32, sad_array); } +#define HIGHBD_SAD16XNX4D(n) \ + void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ + } + +#define HIGHBD_SADSKIP16XNx4D(n) \ + void vpx_highbd_sad_skip_16x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) { @@ -399,3 +436,27 @@ void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums_32, sad_array); } } + +// clang-format off +HIGHBD_SAD64XNX4D(64) +HIGHBD_SADSKIP64XNx4D(64) + +HIGHBD_SAD64XNX4D(32) +HIGHBD_SADSKIP64XNx4D(32) + +HIGHBD_SAD32XNX4D(64) +HIGHBD_SADSKIP32XNx4D(64) + +HIGHBD_SAD32XNX4D(32) +HIGHBD_SADSKIP32XNx4D(32) + +HIGHBD_SAD32XNX4D(16) +HIGHBD_SADSKIP32XNx4D(16) + +HIGHBD_SAD16XNX4D(32) +HIGHBD_SADSKIP16XNx4D(32) + +HIGHBD_SADSKIP16XNx4D(16) + +HIGHBD_SADSKIP16XNx4D(8) + // clang-format on diff --git a/vpx_dsp/x86/highbd_sad4d_sse2.asm b/vpx_dsp/x86/highbd_sad4d_sse2.asm index 6c2a61e01..a07892d81 100644 --- a/vpx_dsp/x86/highbd_sad4d_sse2.asm +++ b/vpx_dsp/x86/highbd_sad4d_sse2.asm @@ -213,7 +213,12 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 -%macro HIGH_SADNXN4D 2 +; Macro Arguments: +; 1: Width +; 2: Height +; 3: If 0, then normal sad, if 2, then skip every other row +%macro HIGH_SADNXN4D 2-3 0 +%if %3 == 0 ; normal sad %if UNIX64 cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 @@ -221,6 +226,15 @@ cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif +%else ; %3 == 2, downsample +%if UNIX64 +cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif ; +%endif ; sad/avg/skip ; set m1 push srcq @@ -229,6 +243,10 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ pshufd m1, m1, 0x0 pop srcq +%if %3 == 2 ; skip rows + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif ; skip rows movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided mov ref2q, [ref1q+gprsize*1] @@ -244,9 +262,15 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ shl ref1q, 1 HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 2 ; Downsampling by two +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep HIGH_PROCESS_%1x2x4 0, 0, 0, 
src_strideq, ref_strideq, 1 %endrep +%undef rep HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 ; N.B. HIGH_PROCESS outputs dwords (32 bits) ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM @@ -265,6 +289,9 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ paddd m4, m0 paddd m6, m1 punpcklqdq m4, m6 +%if %3 == 2 ; skip rows + pslld m4, 1 +%endif movifnidn r4, r4mp movu [r4], m4 RET @@ -285,3 +312,15 @@ HIGH_SADNXN4D 8, 8 HIGH_SADNXN4D 8, 4 HIGH_SADNXN4D 4, 8 HIGH_SADNXN4D 4, 4 + +HIGH_SADNXN4D 64, 64, 2 +HIGH_SADNXN4D 64, 32, 2 +HIGH_SADNXN4D 32, 64, 2 +HIGH_SADNXN4D 32, 32, 2 +HIGH_SADNXN4D 32, 16, 2 +HIGH_SADNXN4D 16, 32, 2 +HIGH_SADNXN4D 16, 16, 2 +HIGH_SADNXN4D 16, 8, 2 +HIGH_SADNXN4D 8, 16, 2 +HIGH_SADNXN4D 8, 8, 2 +HIGH_SADNXN4D 4, 8, 2 diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 231b67f80..78f8eb8bf 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -50,39 +50,49 @@ static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, } } -#define HIGHBD_SAD64XN(n) \ - unsigned int vpx_highbd_sad64x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - __m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 2); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \ - \ - /* sums_16 will outrange after 2 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 1; \ - ref += ref_stride << 1; \ - } \ - return calc_final(sums_32); \ +static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < (n / 2); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 1; + ref += ref_stride << 1; } + return calc_final(sums_32); +} -// 64x64 -HIGHBD_SAD64XN(64) +#define HIGHBD_SAD64XN(n) \ + unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n); \ + } -// 64x32 -HIGHBD_SAD64XN(32) +#define HIGHBD_SADSKIP64xN(n) \ + unsigned int vpx_highbd_sad_skip_64x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -107,42 +117,49 @@ static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, } } -#define HIGHBD_SAD32XN(n) \ - unsigned int 
vpx_highbd_sad32x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - __m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 8); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \ - \ - /* sums_16 will outrange after 8 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 3; \ - ref += ref_stride << 3; \ - } \ - return calc_final(sums_32); \ - } +static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; -// 32x64 -HIGHBD_SAD32XN(64) + for (i = 0; i < (n / 8); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); -// 32x32 -HIGHBD_SAD32XN(32) + highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); -// 32x16 -HIGHBD_SAD32XN(16) + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 3; + ref += ref_stride << 3; + } + return calc_final(sums_32); +} + +#define HIGHBD_SAD32XN(n) \ + unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP32xN(n) \ + unsigned int vpx_highbd_sad_skip_32x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -167,17 +184,22 @@ static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, } } -unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { +static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); __m256i sums_32 = _mm256_setzero_si256(); + const int height = VPXMIN(16, n); + const int num_iters = n / height; int i; - for (i = 0; i < 2; ++i) { + for (i = 0; i < num_iters; ++i) { __m256i sums_16 = _mm256_setzero_si256(); - highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height); // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 sums_32 = _mm256_add_epi32( @@ -192,6 +214,21 @@ unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, return calc_final(sums_32); } +#define HIGHBD_SAD16XN(n) \ + unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return 
highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP16xN(n) \ + unsigned int vpx_highbd_sad_skip_16x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } + unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); @@ -224,6 +261,23 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride, } } +// clang-format off +HIGHBD_SAD64XN(64) +HIGHBD_SADSKIP64xN(64) +HIGHBD_SAD64XN(32) +HIGHBD_SADSKIP64xN(32) +HIGHBD_SAD32XN(64) +HIGHBD_SADSKIP32xN(64) +HIGHBD_SAD32XN(32) +HIGHBD_SADSKIP32xN(32) +HIGHBD_SAD32XN(16) +HIGHBD_SADSKIP32xN(16) +HIGHBD_SAD16XN(32) +HIGHBD_SADSKIP16xN(32) +HIGHBD_SADSKIP16xN(16) +HIGHBD_SADSKIP16xN(8) +//clang-format on + // AVG ------------------------------------------------------------------------- static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, const uint16_t *src, diff --git a/vpx_dsp/x86/highbd_sad_sse2.asm b/vpx_dsp/x86/highbd_sad_sse2.asm index 6a1a6f3d6..62ad2237f 100644 --- a/vpx_dsp/x86/highbd_sad_sse2.asm +++ b/vpx_dsp/x86/highbd_sad_sse2.asm @@ -12,6 +12,11 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro HIGH_SAD_FN 4 %if %4 == 0 %if %3 == 5 @@ -20,7 +25,7 @@ cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg +%elif %4 == 1 ; avg %if %3 == 5 cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ second_pred, n_rows @@ -35,7 +40,18 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%else ; %4 == 2, skip rows +%if %3 == 5 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%endif ; sad/avg/skip +%if %4 == 2 ; double the stride if we are skipping rows + lea src_strided, [src_strided*2] + lea ref_strided, [ref_strided*2] +%endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -54,7 +70,11 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro HIGH_SAD64XN 1-2 0 HIGH_SAD_FN 64, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -146,6 +166,9 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -155,13 +178,19 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 +HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2 +HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2 ; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, ; 
uint8_t *ref, int ref_stride); %macro HIGH_SAD32XN 1-2 0 HIGH_SAD_FN 32, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -213,6 +242,9 @@ HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -224,12 +256,19 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 +HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2 +HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2 +HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2 ; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD16XN 1-2 0 HIGH_SAD_FN 16, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 pxor m6, m6 @@ -281,6 +320,9 @@ HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -292,13 +334,19 @@ HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 - +HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2 +HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2 +HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2 ; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD8XN 1-2 0 HIGH_SAD_FN 8, %1, 7, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 pxor m6, m6 @@ -350,6 +398,9 @@ HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -361,3 +412,5 @@ HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 +HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2 +HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2 diff --git a/vpx_dsp/x86/inv_txfm_avx2.c b/vpx_dsp/x86/inv_txfm_avx2.c new file mode 100644 index 000000000..752435d24 --- /dev/null +++ b/vpx_dsp/x86/inv_txfm_avx2.c @@ -0,0 +1,626 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <immintrin.h> // AVX2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE void idct_load16x16(const tran_low_t *input, __m256i *in, + int stride) { + int i; + // Load 16x16 values + for (i = 0; i < 16; i++) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i in0 = _mm_loadu_si128((const __m128i *)(input + i * stride)); + const __m128i in1 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 4)); + const __m128i in2 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 8)); + const __m128i in3 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 12)); + const __m128i ls = _mm_packs_epi32(in0, in1); + const __m128i rs = _mm_packs_epi32(in2, in3); + in[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(ls), rs, 1); +#else + in[i] = _mm256_load_si256((const __m256i *)(input + i * stride)); +#endif + } +} + +static INLINE __m256i dct_round_shift_avx2(__m256i in) { + const __m256i t = _mm256_add_epi32(in, _mm256_set1_epi32(DCT_CONST_ROUNDING)); + return _mm256_srai_epi32(t, DCT_CONST_BITS); +} + +static INLINE __m256i idct_madd_round_shift_avx2(__m256i *in, __m256i *cospi) { + const __m256i t = _mm256_madd_epi16(*in, *cospi); + return dct_round_shift_avx2(t); +} + +// Calculate the dot product between in0/1 and x and wrap to short. +static INLINE __m256i idct_calc_wraplow_avx2(__m256i *in0, __m256i *in1, + __m256i *x) { + const __m256i t0 = idct_madd_round_shift_avx2(in0, x); + const __m256i t1 = idct_madd_round_shift_avx2(in1, x); + return _mm256_packs_epi32(t0, t1); +} + +// Multiply elements by constants and add them together. 
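+// (Scalar view of butterfly16() below, illustrative only: with
+// cst0 = (c0, -c1) and cst1 = (c1, c0), each input lane pair undergoes
+// the rotation
+//
+//   *out0 = ROUND_POWER_OF_TWO(in0 * c0 - in1 * c1, DCT_CONST_BITS);
+//   *out1 = ROUND_POWER_OF_TWO(in0 * c1 + in1 * c0, DCT_CONST_BITS);
+//
+// i.e. the standard iDCT butterfly, with idct_calc_wraplow_avx2()
+// providing the rounding and the pack back to 16 bits.)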
+static INLINE void butterfly16(__m256i in0, __m256i in1, int c0, int c1, + __m256i *out0, __m256i *out1) { + __m256i cst0 = PAIR256_SET_EPI16(c0, -c1); + __m256i cst1 = PAIR256_SET_EPI16(c1, c0); + __m256i lo = _mm256_unpacklo_epi16(in0, in1); + __m256i hi = _mm256_unpackhi_epi16(in0, in1); + *out0 = idct_calc_wraplow_avx2(&lo, &hi, &cst0); + *out1 = idct_calc_wraplow_avx2(&lo, &hi, &cst1); +} + +static INLINE void idct16_16col(__m256i *in, __m256i *out) { + __m256i step1[16], step2[16]; + + // stage 2 + butterfly16(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly16(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly16(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly16(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly16(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly16(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + step1[8] = _mm256_add_epi16(step2[8], step2[9]); + step1[9] = _mm256_sub_epi16(step2[8], step2[9]); + step1[10] = _mm256_sub_epi16(step2[11], step2[10]); + step1[11] = _mm256_add_epi16(step2[10], step2[11]); + step1[12] = _mm256_add_epi16(step2[12], step2[13]); + step1[13] = _mm256_sub_epi16(step2[12], step2[13]); + step1[14] = _mm256_sub_epi16(step2[15], step2[14]); + step1[15] = _mm256_add_epi16(step2[14], step2[15]); + + // stage 4 + butterfly16(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly16(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly16(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + step2[5] = _mm256_sub_epi16(step1[4], step1[5]); + step1[4] = _mm256_add_epi16(step1[4], step1[5]); + step2[6] = _mm256_sub_epi16(step1[7], step1[6]); + step1[7] = _mm256_add_epi16(step1[6], step1[7]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = _mm256_add_epi16(step2[0], step2[3]); + step1[1] = _mm256_add_epi16(step2[1], step2[2]); + step1[2] = _mm256_sub_epi16(step2[1], step2[2]); + step1[3] = _mm256_sub_epi16(step2[0], step2[3]); + butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[8] = _mm256_add_epi16(step2[8], step2[11]); + step1[9] = _mm256_add_epi16(step2[9], step2[10]); + step1[10] = _mm256_sub_epi16(step2[9], step2[10]); + step1[11] = _mm256_sub_epi16(step2[8], step2[11]); + step1[12] = _mm256_sub_epi16(step2[15], step2[12]); + step1[13] = _mm256_sub_epi16(step2[14], step2[13]); + step1[14] = _mm256_add_epi16(step2[14], step2[13]); + step1[15] = _mm256_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm256_add_epi16(step1[0], step1[7]); + step2[1] = _mm256_add_epi16(step1[1], step1[6]); + step2[2] = _mm256_add_epi16(step1[2], step1[5]); + step2[3] = _mm256_add_epi16(step1[3], step1[4]); + step2[4] = _mm256_sub_epi16(step1[3], step1[4]); + step2[5] = _mm256_sub_epi16(step1[2], step1[5]); + step2[6] = _mm256_sub_epi16(step1[1], step1[6]); + step2[7] = _mm256_sub_epi16(step1[0], step1[7]); + butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + out[0] = _mm256_add_epi16(step2[0], step1[15]); + out[1] = _mm256_add_epi16(step2[1], step1[14]); + out[2] = _mm256_add_epi16(step2[2], 
step2[13]); + out[3] = _mm256_add_epi16(step2[3], step2[12]); + out[4] = _mm256_add_epi16(step2[4], step2[11]); + out[5] = _mm256_add_epi16(step2[5], step2[10]); + out[6] = _mm256_add_epi16(step2[6], step1[9]); + out[7] = _mm256_add_epi16(step2[7], step1[8]); + out[8] = _mm256_sub_epi16(step2[7], step1[8]); + out[9] = _mm256_sub_epi16(step2[6], step1[9]); + out[10] = _mm256_sub_epi16(step2[5], step2[10]); + out[11] = _mm256_sub_epi16(step2[4], step2[11]); + out[12] = _mm256_sub_epi16(step2[3], step2[12]); + out[13] = _mm256_sub_epi16(step2[2], step2[13]); + out[14] = _mm256_sub_epi16(step2[1], step1[14]); + out[15] = _mm256_sub_epi16(step2[0], step1[15]); +} + +static INLINE void recon_and_store16(uint8_t *dest, __m256i in_x) { + const __m256i zero = _mm256_setzero_si256(); + __m256i d0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dest))); + d0 = _mm256_permute4x64_epi64(d0, 0xd8); + d0 = _mm256_unpacklo_epi8(d0, zero); + d0 = _mm256_add_epi16(in_x, d0); + d0 = _mm256_packus_epi16( + d0, _mm256_castsi128_si256(_mm256_extractf128_si256(d0, 1))); + + _mm_storeu_si128((__m128i *)dest, _mm256_castsi256_si128(d0)); +} + +static INLINE void write_buffer_16x1(uint8_t *dest, __m256i in) { + const __m256i final_rounding = _mm256_set1_epi16(1 << 5); + __m256i out; + out = _mm256_adds_epi16(in, final_rounding); + out = _mm256_srai_epi16(out, 6); + recon_and_store16(dest, out); +} + +static INLINE void store_buffer_16x32(__m256i *in, uint8_t *dst, int stride) { + const __m256i final_rounding = _mm256_set1_epi16(1 << 5); + int j = 0; + while (j < 32) { + in[j] = _mm256_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm256_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm256_srai_epi16(in[j], 6); + in[j + 1] = _mm256_srai_epi16(in[j + 1], 6); + + recon_and_store16(dst, in[j]); + dst += stride; + recon_and_store16(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void transpose2_8x8_avx2(__m256i *in, __m256i *out) { + int i; + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(__m256i *in, __m256i *out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[(idx) + 8]), 1); + +#define LOADR(idx) \ + t[8 + (idx)] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + (idx)] = _mm256_inserti128_si256( \ + t[8 + 
(idx)], _mm_load_si128((__m128i const *)&in[(idx) + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + __m256i in[16]; + + // Load 16x16 values + idct_load16x16(input, in, 16); + + transpose_16bit_16x16_avx2(in, in); + idct16_16col(in, in); + + transpose_16bit_16x16_avx2(in, in); + idct16_16col(in, in); + + for (i = 0; i < 16; ++i) { + write_buffer_16x1(dest + i * stride, in[i]); + } +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly_avx2(__m256i *in, __m256i *out, int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm256_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +// For each 16x32 block __m256i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_1(__m256i *in, __m256i *out) { + __m256i step1[8], step2[8]; + + // stage 3 + butterfly16(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly16(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 4 + butterfly16(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly16(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm256_add_epi16(step1[4], step1[5]); + step2[5] = _mm256_sub_epi16(step1[4], step1[5]); + step2[6] = _mm256_sub_epi16(step1[7], step1[6]); + step2[7] = _mm256_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm256_add_epi16(step2[0], step2[3]); + step1[1] = _mm256_add_epi16(step2[1], step2[2]); + step1[2] = _mm256_sub_epi16(step2[1], step2[2]); + step1[3] = _mm256_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm256_add_epi16(step1[0], step1[7]); + out[1] = _mm256_add_epi16(step1[1], step1[6]); + out[2] = _mm256_add_epi16(step1[2], step1[5]); + out[3] = _mm256_add_epi16(step1[3], step1[4]); + out[4] = _mm256_sub_epi16(step1[3], step1[4]); + out[5] = _mm256_sub_epi16(step1[2], step1[5]); + out[6] = _mm256_sub_epi16(step1[1], step1[6]); + out[7] = _mm256_sub_epi16(step1[0], step1[7]); +} + +static INLINE void idct32_16x32_quarter_2_stage_4_to_6(__m256i *step1, + __m256i *out) { + __m256i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly16(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], + &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm256_add_epi16(step2[8], step2[11]); + step1[9] = _mm256_add_epi16(step2[9], step2[10]); + step1[10] = _mm256_sub_epi16(step2[9], step2[10]); + step1[11] = _mm256_sub_epi16(step2[8], step2[11]); + step1[12] = _mm256_sub_epi16(step2[15], step2[12]); + step1[13] = _mm256_sub_epi16(step2[14], step2[13]); + step1[14] = _mm256_add_epi16(step2[14], step2[13]); + step1[15] = _mm256_add_epi16(step2[15], 
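[Editor's aside, not part of the diff] The net effect of transpose_16bit_16x16_avx2(), which the unpack-16/32/64 passes above compute entirely in registers, is a plain 16x16 row/column swap. That swap is what lets the same idct16_16col() serve as both the row pass and the column pass in vpx_idct16x16_256_add_avx2(). A scalar sketch:

#include <stdint.h>

/* Reference semantics of the register transpose above. */
static void transpose_16x16_scalar(const int16_t in[16][16],
                                   int16_t out[16][16]) {
  int r, c;
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c) out[c][r] = in[r][c];
}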
step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], + &out[13]); + butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], + &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +// For each 16x32 block __m256i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_2(__m256i *in, __m256i *out) { + __m256i step1[16], step2[16]; + + // stage 2 + butterfly16(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly16(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly16(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly16(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = _mm256_add_epi16(step2[8], step2[9]); + step1[9] = _mm256_sub_epi16(step2[8], step2[9]); + step1[10] = _mm256_sub_epi16(step2[11], step2[10]); + step1[11] = _mm256_add_epi16(step2[11], step2[10]); + step1[12] = _mm256_add_epi16(step2[12], step2[13]); + step1[13] = _mm256_sub_epi16(step2[12], step2[13]); + step1[14] = _mm256_sub_epi16(step2[15], step2[14]); + step1[15] = _mm256_add_epi16(step2[15], step2[14]); + + idct32_16x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_16x32_quarter_3_4_stage_4_to_7(__m256i *step1, + __m256i *out) { + __m256i step2[32]; + + // stage 4 + step2[16] = _mm256_add_epi16(step1[16], step1[19]); + step2[17] = _mm256_add_epi16(step1[17], step1[18]); + step2[18] = _mm256_sub_epi16(step1[17], step1[18]); + step2[19] = _mm256_sub_epi16(step1[16], step1[19]); + step2[20] = _mm256_sub_epi16(step1[23], step1[20]); + step2[21] = _mm256_sub_epi16(step1[22], step1[21]); + step2[22] = _mm256_add_epi16(step1[22], step1[21]); + step2[23] = _mm256_add_epi16(step1[23], step1[20]); + + step2[24] = _mm256_add_epi16(step1[24], step1[27]); + step2[25] = _mm256_add_epi16(step1[25], step1[26]); + step2[26] = _mm256_sub_epi16(step1[25], step1[26]); + step2[27] = _mm256_sub_epi16(step1[24], step1[27]); + step2[28] = _mm256_sub_epi16(step1[31], step1[28]); + step2[29] = _mm256_sub_epi16(step1[30], step1[29]); + step2[30] = _mm256_add_epi16(step1[29], step1[30]); + step2[31] = _mm256_add_epi16(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + butterfly16(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18], + &step1[29]); + butterfly16(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19], + &step1[28]); + butterfly16(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], + &step1[27]); + butterfly16(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], + &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + out[16] = _mm256_add_epi16(step1[16], step1[23]); + out[17] = _mm256_add_epi16(step1[17], step1[22]); + out[18] = _mm256_add_epi16(step1[18], step1[21]); + out[19] = _mm256_add_epi16(step1[19], step1[20]); + step2[20] = _mm256_sub_epi16(step1[19], step1[20]); + step2[21] = _mm256_sub_epi16(step1[18], step1[21]); + step2[22] = _mm256_sub_epi16(step1[17], step1[22]); + step2[23] = _mm256_sub_epi16(step1[16], step1[23]); + + step2[24] = _mm256_sub_epi16(step1[31], step1[24]); + step2[25] = _mm256_sub_epi16(step1[30], step1[25]); + step2[26] = _mm256_sub_epi16(step1[29], step1[26]); + step2[27] = 
_mm256_sub_epi16(step1[28], step1[27]); + out[28] = _mm256_add_epi16(step1[27], step1[28]); + out[29] = _mm256_add_epi16(step1[26], step1[29]); + out[30] = _mm256_add_epi16(step1[25], step1[30]); + out[31] = _mm256_add_epi16(step1[24], step1[31]); + + // stage 7 + butterfly16(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], + &out[27]); + butterfly16(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], + &out[26]); + butterfly16(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], + &out[25]); + butterfly16(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], + &out[24]); +} + +static INLINE void idct32_1024_16x32_quarter_1_2(__m256i *in, __m256i *out) { + __m256i temp[16]; + + // For each 16x32 block __m256i in[32], + // Input with index, 0, 4, 8, 12, 16, 20, 24, 28 + // output pixels: 0-7 in __m256i out[32] + idct32_1024_16x32_quarter_1(in, temp); + + // Input with index, 2, 6, 10, 14, 18, 22, 26, 30 + // output pixels: 8-15 in __m256i out[32] + idct32_1024_16x32_quarter_2(in, temp); + + // stage 7 + add_sub_butterfly_avx2(temp, out, 16); +} + +// For each 16x32 block __m256i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_3_4(__m256i *in, __m256i *out) { + __m256i step1[32], step2[32]; + + // stage 1 + butterfly16(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly16(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]); + butterfly16(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]); + butterfly16(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + + butterfly16(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly16(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]); + + butterfly16(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]); + butterfly16(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); + + // stage 2 + step2[16] = _mm256_add_epi16(step1[16], step1[17]); + step2[17] = _mm256_sub_epi16(step1[16], step1[17]); + step2[18] = _mm256_sub_epi16(step1[19], step1[18]); + step2[19] = _mm256_add_epi16(step1[19], step1[18]); + step2[20] = _mm256_add_epi16(step1[20], step1[21]); + step2[21] = _mm256_sub_epi16(step1[20], step1[21]); + step2[22] = _mm256_sub_epi16(step1[23], step1[22]); + step2[23] = _mm256_add_epi16(step1[23], step1[22]); + + step2[24] = _mm256_add_epi16(step1[24], step1[25]); + step2[25] = _mm256_sub_epi16(step1[24], step1[25]); + step2[26] = _mm256_sub_epi16(step1[27], step1[26]); + step2[27] = _mm256_add_epi16(step1[27], step1[26]); + step2[28] = _mm256_add_epi16(step1[28], step1[29]); + step2[29] = _mm256_sub_epi16(step1[28], step1[29]); + step2[30] = _mm256_sub_epi16(step1[31], step1[30]); + step2[31] = _mm256_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly16(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly16(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly16(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly16(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_16x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static 
INLINE void idct32_1024_16x32(__m256i *in, __m256i *out) { + __m256i temp[32]; + + // For each 16x32 block __m256i in[32], + // Input with index, 0, 4, 8, 12, 16, 20, 24, 28 + // output pixels: 0-7 in __m256i out[32] + // AND + // Input with index, 2, 6, 10, 14, 18, 22, 26, 30 + // output pixels: 8-15 in __m256i out[32] + idct32_1024_16x32_quarter_1_2(in, temp); + + // For each 16x32 block __m256i in[32], + // Input with odd index, + // 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + // output pixels: 16-23, 24-31 in __m256i out[32] + idct32_1024_16x32_quarter_3_4(in, temp); + + // final stage + add_sub_butterfly_avx2(temp, out, 32); +} + +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i l[32], r[32], out[32], *in; + int i; + + in = l; + + for (i = 0; i < 2; i++) { + idct_load16x16(input, in, 32); + transpose_16bit_16x16_avx2(in, in); + + idct_load16x16(input + 16, in + 16, 32); + transpose_16bit_16x16_avx2(in + 16, in + 16); + idct32_1024_16x32(in, in); + + in = r; + input += 32 << 4; + } + + for (i = 0; i < 32; i += 16) { + transpose_16bit_16x16_avx2(l + i, out); + transpose_16bit_16x16_avx2(r + i, out + 16); + idct32_1024_16x32(out, out); + + store_buffer_16x32(out, dest, stride); + dest += 16; + } +} + +// Case when only upper-left 16x16 has non-zero coeff +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i in[32], io[32], out[32]; + int i; + + for (i = 16; i < 32; i++) { + in[i] = _mm256_setzero_si256(); + } + + // rows + idct_load16x16(input, in, 32); + transpose_16bit_16x16_avx2(in, in); + idct32_1024_16x32(in, io); + + // columns + for (i = 0; i < 32; i += 16) { + transpose_16bit_16x16_avx2(io + i, in); + idct32_1024_16x32(in, out); + + store_buffer_16x32(out, dest, stride); + dest += 16; + } +} diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 7d8352721..5ff5abc11 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -19,17 +19,18 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -38,12 +39,9 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - *eob_ptr = 0; - load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, - dequant_ptr, &dequant, quant_shift_ptr, &shift); + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); // Do DC and first 15 AC. 
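[Editor's aside, not part of the diff] For reference while reading the quantize kernels below, a scalar sketch of the b-quantizer they vectorize, simplified from the C reference vpx_quantize_b_c(); the int16_t clamp of the rounded value is omitted for brevity, and zbin here is the unadjusted value (the SIMD paths pre-subtract 1 so they can use a strict pcmpgtw):

#include <stdint.h>

typedef int32_t tran_low_t; /* as in the non-high-bitdepth build */

/* Index 0 of each table is the DC value; index 1 covers all AC values. */
static void quantize_b_sketch(const tran_low_t *coeff, intptr_t n,
                              const int16_t zbin[2], const int16_t round[2],
                              const int16_t quant[2], const int16_t shift[2],
                              const int16_t dequant[2], const int16_t *scan,
                              tran_low_t *qcoeff, tran_low_t *dqcoeff,
                              uint16_t *eob) {
  intptr_t i;
  int last = -1;
  for (i = 0; i < n; i++) {
    const int rc = scan[i];
    const int c = coeff[rc];
    const int sign = c < 0 ? -1 : 1;
    const int abs_c = c * sign;
    qcoeff[rc] = 0;
    dqcoeff[rc] = 0;
    if (abs_c >= zbin[rc != 0]) {
      int tmp = abs_c + round[rc != 0]; /* C reference clamps to int16_t */
      tmp = ((((tmp * quant[rc != 0]) >> 16) + tmp) * shift[rc != 0]) >> 16;
      qcoeff[rc] = tmp * sign;
      dqcoeff[rc] = qcoeff[rc] * dequant[rc != 0];
      if (tmp) last = (int)i;
    }
  }
  *eob = (uint16_t)(last + 1);
}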
coeff0 = load_tran_low(coeff_ptr); @@ -140,17 +138,15 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); const __m256i big_zero = _mm256_setzero_si256(); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -159,27 +155,8 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - (void)n_coeffs; - - // Setup global values. - // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. - zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - shift = _mm_slli_epi16(shift, 1); + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 28f7c9c7d..d4872f6bc 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -13,13 +13,15 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( - const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, - __m256i *round, const int16_t *quant_ptr, __m256i *quant, - const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, + const struct macroblock_plane *mb_plane, __m256i *zbin, __m256i *round, + __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant, __m256i *shift, int log_scale) { - *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->zbin)); *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); if (log_scale > 0) { const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); @@ -30,7 +32,8 @@ static VPX_FORCE_INLINE void load_b_values_avx2( // calculating the zbin mask. 
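[Editor's aside, not part of the diff; the parenthetical of the comment above continues just below] A scalar sketch of the zbin handling in load_b_values_avx2(): for log_scale = 1 (the 32x32 transform) zbin is halved with rounding, and in all cases 1 is pre-subtracted so the zbin test can be a strict greater-than:

#include <stdint.h>

static int16_t adjust_zbin(int16_t zbin, int log_scale) {
  if (log_scale > 0)
    zbin = (int16_t)((zbin + (1 << (log_scale - 1))) >> log_scale);
  return (int16_t)(zbin - 1); /* turn ">= zbin" into "> zbin - 1" */
}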
(See quantize_b_logscale{0,1,2}_16) *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); - *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->round)); *round = _mm256_permute4x64_epi64(*round, 0x54); if (log_scale > 0) { const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); @@ -38,12 +41,14 @@ static VPX_FORCE_INLINE void load_b_values_avx2( *round = _mm256_srai_epi16(*round, log_scale); } - *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->quant)); *quant = _mm256_permute4x64_epi64(*quant, 0x54); *dequant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); - *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); + *shift = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->quant_shift)); *shift = _mm256_permute4x64_epi64(*shift, 0x54); } @@ -151,20 +156,17 @@ static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) { } void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - (void)scan; + const int16_t *iscan = scan_order->iscan; - load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, - &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, - &v_quant_shift, 0); + load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, + &v_dequant, &v_quant_shift, 0); // Do DC and first 15 AC. v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, &v_dequant, &v_round, &v_zbin, &v_quant_shift); @@ -250,23 +252,18 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( } } -void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - (void)n_coeffs; - (void)scan; + const int16_t *iscan = scan_order->iscan; - load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, - &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, - &v_quant_shift, 1); + load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, + &v_dequant, &v_quant_shift, 1); // Do DC and first 15 AC. 
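[Editor's aside, not part of the diff] The eob max-reductions in these kernels implement the following scalar rule, which is why only iscan is needed from the ScanOrder: iscan maps a raster coefficient position to its place in scan order, and eob is one past the last nonzero coefficient in that order. A sketch:

#include <stdint.h>

typedef int32_t tran_low_t;

static uint16_t eob_from_iscan(const tran_low_t *qcoeff,
                               const int16_t *iscan, intptr_t n) {
  intptr_t i;
  int eob = 0;
  for (i = 0; i < n; i++)
    if (qcoeff[i] && iscan[i] + 1 > eob) eob = iscan[i] + 1;
  return (uint16_t)eob;
}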
v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index 9533e7916..64838eaa7 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -16,16 +16,16 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index = 16; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; @@ -33,11 +33,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan; - // Setup global values. - load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, - dequant_ptr, &dequant, quant_shift_ptr, &shift); + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index 27bfb4e41..82c755a0c 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -15,26 +15,53 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" -static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, - const int16_t *round_ptr, __m128i *round, - const int16_t *quant_ptr, __m128i *quant, +static INLINE void load_b_values(const struct macroblock_plane *const mb_plane, + __m128i *zbin, __m128i *round, __m128i *quant, const int16_t *dequant_ptr, __m128i *dequant, - const int16_t *shift_ptr, __m128i *shift) { - *zbin = _mm_load_si128((const __m128i *)zbin_ptr); - *round = _mm_load_si128((const __m128i *)round_ptr); - *quant = _mm_load_si128((const __m128i *)quant_ptr); + __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); *dequant = _mm_load_si128((const __m128i *)dequant_ptr); - *shift = _mm_load_si128((const __m128i *)shift_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); } -static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, - const int16_t *quant_ptr, __m128i *quant, +static INLINE void load_b_values32x32( + const struct macroblock_plane *const mb_plane, __m128i *zbin, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, + __m128i *dequant, __m128i *shift) { + const __m128i one = _mm_set1_epi16(1); + // The 32x32 halves zbin and round. + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + // Shift with rounding. + *zbin = _mm_add_epi16(*zbin, one); + *zbin = _mm_srli_epi16(*zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. 
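[Editor's aside, not part of the diff; the subtraction that the comment above describes, plus the round and shift adjustments, follow immediately below] A scalar sketch of the complete load_b_values32x32() setup: zbin and round are halved with rounding, and quant_shift is doubled because the 32x32 C reference shifts the final product by 15 instead of 16:

#include <stdint.h>

static void setup_b_32x32(int16_t *zbin, int16_t *round, int16_t *shift) {
  *zbin = (int16_t)(((*zbin + 1) >> 1) - 1); /* halve, then -1 for pcmpgtw */
  *round = (int16_t)((*round + 1) >> 1);     /* halve with rounding */
  *shift = (int16_t)(*shift << 1);           /* double quant_shift */
}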
+ *zbin = _mm_sub_epi16(*zbin, one); + + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *round = _mm_add_epi16(*round, one); + *round = _mm_srli_epi16(*round, 1); + + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + *shift = _mm_slli_epi16(*shift, 1); +} + +static INLINE void load_fp_values(const struct macroblock_plane *mb_plane, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, __m128i *dequant) { - *round = _mm_load_si128((const __m128i *)round_ptr); - *quant = _mm_load_si128((const __m128i *)quant_ptr); + *round = _mm_load_si128((const __m128i *)mb_plane->round_fp); + *quant = _mm_load_si128((const __m128i *)mb_plane->quant_fp); *dequant = _mm_load_si128((const __m128i *)dequant_ptr); } diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 476230286..2c6d851a1 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,16 +16,17 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index = 16; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -33,10 +34,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan; - - load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, - dequant_ptr, &dequant, quant_shift_ptr, &shift); + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); @@ -107,17 +105,14 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -126,30 +121,8 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - (void)n_coeffs; - - // Setup global values. 
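[Editor's aside, not part of the diff] The "I suspect" comment carried into load_b_values32x32() above can be illustrated with a tiny, purely hypothetical example of the 16-bit wraparound it worries about (values are illustrative; two's-complement narrowing is assumed):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int16_t shift = (int16_t)0x4000;         /* 1 << 14 */
  const int16_t doubled = (int16_t)(shift << 1); /* 0x8000: sign flipped */
  printf("%d << 1 wraps to %d in a 16-bit lane\n", shift, doubled);
  return 0;
}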
- // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. - zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - // I suspect this is not technically OK because quant_shift can be up - // to 1 << 16 and shifting up again will outrange that, but the test is not - // comprehensive enough to catch that and "it's been that way forever" - shift = _mm_slli_epi16(shift, 1); + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 399b67b3f..cf7111983 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -25,9 +25,10 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, _mm_storeu_si128((__m128i *)sad_array, sum); } -void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { +static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { int i; const uint8_t *refs[4]; __m256i sums[4]; @@ -41,7 +42,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, sums[2] = _mm256_setzero_si256(); sums[3] = _mm256_setzero_si256(); - for (i = 0; i < 32; i++) { + for (i = 0; i < h; i++) { __m256i r[4]; // load src and all ref[] @@ -73,9 +74,10 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } -void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { +static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { __m256i sums[4]; int i; const uint8_t *refs[4]; @@ -89,7 +91,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, sums[2] = _mm256_setzero_si256(); sums[3] = _mm256_setzero_si256(); - for (i = 0; i < 64; i++) { + for (i = 0; i < h; i++) { __m256i r_lo[4], r_hi[4]; // load 64 bytes from src and all ref[] const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr); @@ -132,3 +134,51 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } + +#define SAD64_H(h) \ + void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \ + } + +#define SAD32_H(h) \ + void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \ + } + +SAD64_H(64) +SAD32_H(32) + +#define SADS64_H(h) \ + void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const 
ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + ((h) >> 1), sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +#define SADS32_H(h) \ + void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + ((h) >> 1), sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +SADS64_H(64) +SADS64_H(32) + +SADS32_H(64) +SADS32_H(32) +SADS32_H(16) diff --git a/vpx_dsp/x86/sad4d_sse2.asm b/vpx_dsp/x86/sad4d_sse2.asm index 3f6e55ce9..ed4ea3ef9 100644 --- a/vpx_dsp/x86/sad4d_sse2.asm +++ b/vpx_dsp/x86/sad4d_sse2.asm @@ -179,7 +179,16 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 -%macro SADNXN4D 2 +%macro SADNXN4D 2-3 0 +%if %3 == 1 ; skip rows +%if UNIX64 +cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif +%else ; normal sad %if UNIX64 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 @@ -187,6 +196,11 @@ cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif +%endif +%if %3 == 1 + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided mov ref2q, [ref1q+gprsize*1] @@ -195,9 +209,15 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ mov ref1q, [ref1q+gprsize*0] PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 1 ; downsample number of rows by 2 +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 %endrep +%undef num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 %if %1 > 4 @@ -211,12 +231,19 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ punpckhqdq m5, m7 movifnidn r4, r4mp paddd m4, m5 +%if %3 == 1 + pslld m4, 1 +%endif movu [r4], m4 RET %else movifnidn r4, r4mp pshufd m6, m6, 0x08 pshufd m7, m7, 0x08 +%if %3 == 1 + pslld m6, 1 + pslld m7, 1 +%endif movq [r4+0], m6 movq [r4+8], m7 RET @@ -237,3 +264,15 @@ SADNXN4D 8, 8 SADNXN4D 8, 4 SADNXN4D 4, 8 SADNXN4D 4, 4 + +SADNXN4D 64, 64, 1 +SADNXN4D 64, 32, 1 +SADNXN4D 32, 64, 1 +SADNXN4D 32, 32, 1 +SADNXN4D 32, 16, 1 +SADNXN4D 16, 32, 1 +SADNXN4D 16, 16, 1 +SADNXN4D 16, 8, 1 +SADNXN4D 8, 16, 1 +SADNXN4D 8, 8, 1 +SADNXN4D 4, 8, 1 diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c index 29bedb0e6..e00494d76 100644 --- a/vpx_dsp/x86/sad_avx2.c +++ b/vpx_dsp/x86/sad_avx2.c @@ -11,73 +11,104 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" +static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + for (i = 0; i < h; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i 
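[Editor's aside, not part of the diff] The vpx_sad_skip_* entry points added above (C macros and assembly alike) all use the same approximation: evaluate the SAD on every other row by doubling both strides and halving the row count, then double the result so it stays on the same scale as a full SAD. A scalar sketch:

#include <stdint.h>
#include <stdlib.h>

static unsigned int sad_skip_sketch(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int width, int height) {
  unsigned int sad = 0;
  int x, y;
  for (y = 0; y < height; y += 2) { /* skipped rows */
    for (x = 0; x < width; x++) sad += (unsigned int)abs(src[x] - ref[x]);
    src += 2 * src_stride;
    ref += 2 * ref_stride;
  }
  return 2 * sad; /* compensate for the halved row count */
}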
const *)(ref_ptr + 32)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref_stride; + src_ptr += src_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + +static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + const int ref2_stride = ref_stride << 1; + const int src2_stride = src_stride << 1; + const int max = h >> 1; + for (i = 0; i < max; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref2_stride; + src_ptr += src2_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + #define FSAD64_H(h) \ unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - for (i = 0; i < h; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref_stride; \ - src_ptr += src_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ + return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS64_H(h) \ + unsigned int vpx_sad_skip_64x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } #define FSAD32_H(h) \ unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride 
= src_stride << 1; \ - int max = h >> 1; \ - for (i = 0; i < max; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref2_stride; \ - src_ptr += src2_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS32_H(h) \ + unsigned int vpx_sad_skip_32x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } -#define FSAD64 \ - FSAD64_H(64) \ - FSAD64_H(32) +#define FSAD64 \ + FSAD64_H(64) \ + FSAD64_H(32) \ + FSADS64_H(64) \ + FSADS64_H(32) -#define FSAD32 \ - FSAD32_H(64) \ - FSAD32_H(32) \ - FSAD32_H(16) +#define FSAD32 \ + FSAD32_H(64) \ + FSAD32_H(32) \ + FSAD32_H(16) \ + FSADS32_H(64) \ + FSADS32_H(32) \ + FSADS32_H(16) FSAD64 FSAD32 @@ -86,6 +117,8 @@ FSAD32 #undef FSAD32 #undef FSAD64_H #undef FSAD32_H +#undef FSADS64_H +#undef FSADS32_H #define FSADAVG64_H(h) \ unsigned int vpx_sad64x##h##_avg_avx2( \ diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm index e4e1bc3e9..627e463bf 100644 --- a/vpx_dsp/x86/sad_sse2.asm +++ b/vpx_dsp/x86/sad_sse2.asm @@ -12,15 +12,29 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro SAD_FN 4 -%if %4 == 0 +%if %4 == 0 ; normal sad %if %3 == 5 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg + +%elif %4 == 2 ; skip +%if %3 == 5 +cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 + +%else %if %3 == 5 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ second_pred, n_rows @@ -35,7 +49,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%endif ; sad/avg/skip +%if %4 == 2; skip rows so double the stride +lea src_strided, [src_strided*2] +lea ref_strided, [ref_strided*2] +%endif ; %4 skip movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -48,7 +66,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro SAD64XN 1-2 0 SAD_FN 64, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -77,6 +99,9 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the 
sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -86,12 +111,18 @@ SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; sad64x32_avg_sse2 +SAD64XN 64, 2 ; sad64x64_skip_sse2 +SAD64XN 32, 2 ; sad64x32_skip_sse2 ; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD32XN 1-2 0 SAD_FN 32, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -120,6 +151,9 @@ SAD64XN 32, 1 ; sad64x32_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -131,12 +165,19 @@ SAD32XN 16 ; sad32x16_sse2 SAD32XN 64, 1 ; sad32x64_avg_sse2 SAD32XN 32, 1 ; sad32x32_avg_sse2 SAD32XN 16, 1 ; sad32x16_avg_sse2 +SAD32XN 64, 2 ; sad32x64_skip_sse2 +SAD32XN 32, 2 ; sad32x32_skip_sse2 +SAD32XN 16, 2 ; sad32x16_skip_sse2 ; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD16XN 1-2 0 SAD_FN 16, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -166,6 +207,9 @@ SAD32XN 16, 1 ; sad32x16_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -177,12 +221,19 @@ SAD16XN 8 ; sad16x8_sse2 SAD16XN 32, 1 ; sad16x32_avg_sse2 SAD16XN 16, 1 ; sad16x16_avg_sse2 SAD16XN 8, 1 ; sad16x8_avg_sse2 +SAD16XN 32, 2 ; sad16x32_skip_sse2 +SAD16XN 16, 2 ; sad16x16_skip_sse2 +SAD16XN 8, 2 ; sad16x8_skip_sse2 ; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD8XN 1-2 0 SAD_FN 8, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -210,6 +261,9 @@ SAD16XN 8, 1 ; sad16x8_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -221,12 +275,18 @@ SAD8XN 4 ; sad8x4_sse2 SAD8XN 16, 1 ; sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 4, 1 ; sad8x4_avg_sse2 +SAD8XN 16, 2 ; sad8x16_skip_sse2 +SAD8XN 8, 2 ; sad8x8_skip_sse2 ; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD4XN 1-2 0 SAD_FN 4, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -257,6 +317,9 @@ SAD8XN 4, 1 ; sad8x4_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -266,3 +329,4 @@ SAD4XN 8 ; sad4x8_sse SAD4XN 4 ; sad4x4_sse SAD4XN 8, 1 ; sad4x8_avg_sse SAD4XN 4, 1 ; sad4x4_avg_sse +SAD4XN 8, 2 ; sad4x8_skip_sse diff --git a/vpx_dsp/x86/sse_avx2.c b/vpx_dsp/x86/sse_avx2.c new file mode 100644 index 000000000..917ff0ef1 --- /dev/null +++ b/vpx_dsp/x86/sse_avx2.c @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
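[Editor's aside, not part of the diff; the license header of the new file closes just below] The new sse_avx2.c and sse_sse4.c files accelerate vpx_sse, the sum of squared differences between two 8-bit blocks. A scalar sketch of the semantics, matching the vpx_sse_c reference:

#include <stdint.h>

static int64_t sse_sketch(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int width, int height) {
  int64_t sse = 0;
  int x, y;
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) {
      const int d = a[x] - b[x];
      sse += d * d;
    }
    a += a_stride;
    b += b_stride;
  }
  return sse;
}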
+ */ + +#include <smmintrin.h> +#include <immintrin.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, + const uint8_t *b) { + const __m256i v_a0 = _mm256_loadu_si256((const __m256i *)a); + const __m256i v_b0 = _mm256_loadu_si256((const __m256i *)b); + const __m256i zero = _mm256_setzero_si256(); + const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero); + const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero); + const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero); + const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero); + const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); + const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { + int64_t sum; + __m256i zero = _mm256_setzero_si256(); + const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero); + const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) { + const __m256i sum0_4x64 = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32)); + const __m256i sum1_4x64 = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1)); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + *sum = _mm256_add_epi64(*sum, sum_4x64); +} + +static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) { + int64_t sum; + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} +#endif + +static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = load_unaligned_u32(a); + const __m128i v_a1 = load_unaligned_u32(a + a_stride); + const __m128i v_a2 = load_unaligned_u32(a + a_stride * 2); + const __m128i v_a3 = load_unaligned_u32(a + a_stride * 3); + const __m128i v_b0 = load_unaligned_u32(b); + const __m128i v_b1 = load_unaligned_u32(b + b_stride); + const __m128i v_b2 = load_unaligned_u32(b + b_stride * 2); + const __m128i v_b3 = load_unaligned_u32(b + b_stride * 3); + const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1), + _mm_unpacklo_epi32(v_a2, v_a3)); + const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1), + _mm_unpacklo_epi32(v_b2, v_b3)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + 
a_stride)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + __m256i sum = _mm256_setzero_si256(); + __m256i zero = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + sse_w4x4_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + sse_w8x2_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a); + const __m128i v_a1 = _mm_loadu_si128((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b); + const __m128i v_b1 = _mm_loadu_si128((const __m128i *)(b + b_stride)); + const __m256i v_a = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01); + const __m256i v_b = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01); + const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero); + const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero); + const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero); + const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero); + const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl); + const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu); + const __m256i temp = + _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub), + _mm256_madd_epi16(v_bsub, v_bsub)); + sum = _mm256_add_epi32(sum, temp); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + sse_w32_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: + if ((width & 0x07) == 0) { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + const uint8_t *a2; + const uint8_t *b2; + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + a2 = a + i + (a_stride << 1); + b2 = b + i + (b_stride << 1); + sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } + sse = summary_all_avx2(&sum); + break; + } + + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, + const uint16_t *b) { + const __m256i v_a_w = _mm256_loadu_si256((const __m256i *)a); + const __m256i v_b_w = _mm256_loadu_si256((const __m256i *)b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, 
_mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_a2 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 2)); + const __m128i v_a3 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 3)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_b2 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 2)); + const __m128i v_b3 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 3)); + const __m128i v_a_hi = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_a_lo = _mm_unpacklo_epi64(v_a2, v_a3); + const __m256i v_a_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); + const __m128i v_b_hi = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_b_lo = _mm_unpacklo_epi64(v_b2, v_b3); + const __m256i v_b_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a_hi = _mm_loadu_si128((const __m128i *)(a + a_stride)); + const __m128i v_a_lo = _mm_loadu_si128((const __m128i *)a); + const __m256i v_a_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); + const __m128i v_b_hi = _mm_loadu_si128((const __m128i *)(b + b_stride)); + const __m128i v_b_lo = _mm_loadu_si128((const __m128i *)b); + const __m256i v_b_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + highbd_sse_w16_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16, b + 16); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 64; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + case 64: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < 
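[Editor's aside, not part of the diff] The l-bounded inner loops in the high-bitdepth cases batch rows because, with up-to-16-bit samples, the 32-bit madd accumulators can only absorb a bounded number of rows before overflowing; partial sums are drained into 64-bit lanes once per batch, and the row budget shrinks as the width grows (64 rows at width 32, 32 rows at width 64 in the code here). A scalar sketch of the width-32 case:

#include <stdint.h>

static int64_t highbd_sse_batched_sketch(const uint16_t *a, int a_stride,
                                         const uint16_t *b, int b_stride,
                                         int height) {
  int64_t sum64 = 0;
  int y = 0;
  while (y < height) {
    uint32_t sum32 = 0; /* narrow accumulator, like the SIMD lanes */
    int l = 0;
    do {
      int x;
      for (x = 0; x < 32; x++) {
        const int d = a[x] - b[x];
        sum32 += (uint32_t)(d * d);
      }
      a += a_stride;
      b += b_stride;
      l++;
    } while (l < 64 && l < height - y); /* drain before overflow is possible */
    sum64 += sum32;
    y += l;
  }
  return sum64;
}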
(height - y)); + summary_32_avx2(&sum32, &sum); + y += 32; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + default: + if (width & 0x7) { + do { + int i = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + const uint16_t *a2; + const uint16_t *b2; + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + a2 = a + i + (a_stride << 1); + b2 = b + i + (b_stride << 1); + highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride); + summary_32_avx2(&sum32, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } else { + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + int i = 0; + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + l += 2; + } while (l < 8 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 8; + } while (y < height); + } + sse = summary_4x64_avx2(sum); + break; + } + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/sse_sse4.c b/vpx_dsp/x86/sse_sse4.c new file mode 100644 index 000000000..4a7585c57 --- /dev/null +++ b/vpx_dsp/x86/sse_sse4.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { + int64_t sum; + const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); + const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { + const __m128i sum0 = _mm_cvtepu32_epi64(*sum32); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8)); + *sum64 = _mm_add_epi64(sum0, *sum64); + *sum64 = _mm_add_epi64(sum1, *sum64); +} +#endif + +static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, + const uint8_t *b) { + const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a); + const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b); + const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8)); + const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8)); + const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w); + const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m128i *sum) { + const __m128i v_a0 = load_unaligned_u32(a); + const __m128i v_a1 = load_unaligned_u32(a + 
a_stride); + const __m128i v_b0 = load_unaligned_u32(b); + const __m128i v_b1 = load_unaligned_u32(b + b_stride); + const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); + const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b, + __m128i *sum) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y = 0; + int64_t sse = 0; + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + sse4x2_sse4_1(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + sse8_sse4_1(a, b, &sum); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + sse_w16_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: + if (width & 0x07) { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + } + sse = summary_all_sse4(&sum); + break; + } + + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, + const uint16_t *b) { + const __m128i v_a_w = _mm_loadu_si128((const __m128i *)a); + const __m128i v_b_w = _mm_loadu_si128((const __m128i *)b); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, 
v_d_w)); +} + +int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 64; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 32: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 32; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 64: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 16; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + default: + if (width & 0x7) { + do { + __m128i sum32 = _mm_setzero_si128(); + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + summary_32_sse4(&sum32, &sum); + } while (y < height); + } else { + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + } + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + } + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/subtract_sse2.asm b/vpx_dsp/x86/subtract_sse2.asm index 4273efb85..e3055ab29 100644 --- a/vpx_dsp/x86/subtract_sse2.asm +++ b/vpx_dsp/x86/subtract_sse2.asm @@ -124,4 +124,5 @@ INIT_MMX lea predq, [predq+pred_str*2] sub rowsd, 2 jg .loop_4 + emms RET diff --git 
a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c index 35925d590..8305b9f20 100644 --- a/vpx_dsp/x86/variance_avx2.c +++ b/vpx_dsp/x86/variance_avx2.c @@ -98,6 +98,41 @@ static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { return _mm256_add_epi32(sum_lo, sum_hi); } +static INLINE void variance8_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int ref_stride, __m256i *const sse, __m256i *const sum) { + __m128i src0, src1, ref0, ref1; + __m256i ss, rr, diff; + + // 0 0 0.... 0 s07 s06 s05 s04 s03 s02 s01 s00 + src0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride)); + + // 0 0 0.... 0 s17 s16 s15 s14 s13 s12 s11 s10 + src1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride)); + + // s17 s16...s11 s10 s07 s06...s01 s00 (8bit) + src0 = _mm_unpacklo_epi64(src0, src1); + + // s17 s16...s11 s10 s07 s06...s01 s00 (16 bit) + ss = _mm256_cvtepu8_epi16(src0); + + // 0 0 0.... 0 r07 r06 r05 r04 r03 r02 r01 r00 + ref0 = _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride)); + + // 0 0 0.... 0 r17 r16 r15 r14 r13 r12 r11 r10 + ref1 = _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride)); + + // r17 r16...r11 r10 r07 r06...r01 r00 (8 bit) + ref0 = _mm_unpacklo_epi64(ref0, ref1); + + // r17 r16...r11 r10 r07 r06...r01 r00 (16 bit) + rr = _mm256_cvtepu8_epi16(ref0); + + diff = _mm256_sub_epi16(ss, rr); + *sse = _mm256_add_epi32(*sse, _mm256_madd_epi16(diff, diff)); + *sum = _mm256_add_epi16(*sum, diff); +} + static INLINE void variance16_kernel_avx2( const uint8_t *const src, const int src_stride, const uint8_t *const ref, const int ref_stride, __m256i *const sse, __m256i *const sum) { @@ -119,6 +154,21 @@ static INLINE void variance32_kernel_avx2(const uint8_t *const src, variance_kernel_avx2(s, r, sse, sum); } +static INLINE void variance8_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + *vsse = _mm256_setzero_si256(); + + for (i = 0; i < h; i += 2) { + variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } +} + static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m256i *const vsse, @@ -612,6 +662,36 @@ typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 5); +} + +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 6); +} + +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + 
variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 7); +} + unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index c7d880860..526c28382 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/x86/convolve.h" #include "vpx_dsp/x86/convolve_avx2.h" #include "vpx_dsp/x86/convolve_sse2.h" +#include "vpx_dsp/x86/convolve_ssse3.h" #include "vpx_ports/mem.h" // filters for 16_h8 @@ -38,6 +39,27 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; +DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, + 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, + 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, +}; + +#define CALC_CONVOLVE8_HORZ_ROW \ + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); \ + s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \ + s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]); \ + s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]); \ + s1[3] = _mm256_shuffle_epi8(srcReg, filt[3]); \ + s1[0] = convolve8_16_avx2(s1, f1); \ + s1[0] = _mm256_packus_epi16(s1[0], s1[0]); \ + src_ptr += src_stride; \ + _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(s1[0])); \ + output_ptr += output_pitch; \ + _mm_storel_epi64((__m128i *)&output_ptr[0], \ + _mm256_extractf128_si256(s1[0], 1)); \ + output_ptr += output_pitch; + static INLINE void vpx_filter_block1d16_h8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter, @@ -61,12 +83,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( __m256i srcReg; // load the 2 strides of source - srcReg = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg = _mm256_inserti128_si256( - srcReg, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), - 1); + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr + src_pixels_per_line - 3); // filter the source buffer s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); @@ -77,12 +94,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg = _mm256_inserti128_si256( - srcReg, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), - 1); + srcReg = mm256_loadu2_si128(src_ptr + 5, src_ptr + src_pixels_per_line + 5); // filter the source buffer s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); @@ -97,60 +109,37 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( src_ptr += src_stride; - // average if necessary - outReg1 = _mm256_castsi256_si128(outReg32b1); - outReg2 = _mm256_extractf128_si256(outReg32b1, 1); if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - outReg2 = _mm_avg_epu8( - outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch))); + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + output_pitch)); + outReg32b1 = _mm256_avg_epu8(outReg32b1, outReg); } - - // save 16 bytes - 
_mm_store_si128((__m128i *)output_ptr, outReg1); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2); - + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + output_pitch), &outReg32b1); output_ptr += dst_stride; } // if the number of strides is odd. // process only 16 bytes if (i > 0) { - __m128i srcReg; - - // load the first 16 bytes of the last row - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + const __m256i srcReg = + _mm256_inserti128_si256(_mm256_castsi128_si256(srcReg1), srcReg2, 1); // filter the source buffer - s[0] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); - s[1] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); - s[2] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); - s[3] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); - outReg1 = convolve8_8_avx2(s, f); - - // reading the next 16 bytes - // (part of it was being read by earlier read) - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); - // filter the source buffer - s[0] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); - s[1] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); - s[2] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); - s[3] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); - outReg2 = convolve8_8_avx2(s, f); + // The low and high 128-bits of each lane contain the first and second + // convolve result respectively + outReg32b1 = convolve8_16_avx2(s, f); + outReg1 = _mm256_castsi256_si128(outReg32b1); + outReg2 = _mm256_extractf128_si256(outReg32b1, 1); - // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane - // contain the first and second convolve result respectively + // shrink to 8 bit each 16 bits outReg1 = _mm_packus_epi16(outReg1, outReg2); // average if necessary @@ -177,11 +166,63 @@ static void vpx_filter_block1d16_h8_avg_avx2( output_height, filter, 1); } +static void vpx_filter_block1d8_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m256i filt[4], f1[4], s1[4], srcReg; + __m128i f[4], s[4]; + int y = output_height; + + // Multiply the size of the source stride by two + const ptrdiff_t src_stride = src_pitch << 1; + + shuffle_filter_avx2(filter, f1); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // Process next 4 rows + while (y > 3) { + CALC_CONVOLVE8_HORZ_ROW + CALC_CONVOLVE8_HORZ_ROW + y -= 4; + } + + // If remaining, then process 2 rows at a time + while (y > 1) { + CALC_CONVOLVE8_HORZ_ROW + y -= 2; + } + + // For the remaining height. 
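// Editor's note: a minimal scalar sketch of what each output pixel of this
// 8-tap horizontal path computes, assuming the usual vpx_dsp helpers
// (FILTER_BITS == 7, ROUND_POWER_OF_TWO, clip_pixel):
//   int k, sum = 0;
//   for (k = 0; k < 8; ++k) sum += src_ptr[k - 3] * filter[k];
//   *output_ptr = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));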
+ if (y > 0) { + const __m128i src_reg_128 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + f[0] = _mm256_castsi256_si128(f1[0]); + f[1] = _mm256_castsi256_si128(f1[1]); + f[2] = _mm256_castsi256_si128(f1[2]); + f[3] = _mm256_castsi256_si128(f1[3]); + + // filter the source buffer + s[0] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[0])); + s[1] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[1])); + s[2] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[2])); + s[3] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[3])); + s[0] = convolve8_8_ssse3(s, f); + + // Saturate 16bit value to 8bit. + s[0] = _mm_packus_epi16(s[0], s[0]); + + // Save only 8 bytes + _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]); + } +} + static INLINE void vpx_filter_block1d16_v8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter, const int avg) { - __m128i outReg1, outReg2; __m256i srcRegHead1; unsigned int i; ptrdiff_t src_stride, dst_stride; @@ -260,19 +301,14 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( src_ptr += src_stride; // average if necessary - outReg1 = _mm256_castsi256_si128(s1[0]); - outReg2 = _mm256_extractf128_si256(s1[0], 1); if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - outReg2 = _mm_avg_epu8( - outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch))); + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + out_pitch)); + s1[0] = _mm256_avg_epu8(s1[0], outReg); } - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, outReg1); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2); + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + out_pitch), s1); output_ptr += dst_stride; @@ -534,9 +570,6 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, const ptrdiff_t unrolled_dst_stride = dst_stride << 1; int h; - __m256i src_reg, src_reg_shift_0, src_reg_shift_2; - __m256i dst_reg; - __m256i tmp_0, tmp_1; __m256i idx_shift_0 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); @@ -557,9 +590,11 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, for (h = height; h >= 2; h -= 2) { // Load the source - src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); - src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); - src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + const __m256i src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + __m256i dst_reg; + __m256i tmp_0, tmp_1; + const __m256i src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + const __m256i src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); // Get the output tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); @@ -580,9 +615,9 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, // Repeat for the last row if needed if (h > 0) { - __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + const __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr); __m128i dst_reg; - const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding __m128i tmp_0, tmp_1; __m128i src_reg_shift_0 = @@ -596,7 +631,7 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, _mm256_castsi256_si128(kernel_reg_45)); dst_reg 
= _mm_adds_epi16(tmp_0, tmp_1); - dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32, 6); + dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32_128, 6); dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128()); @@ -715,8 +750,6 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, const ptrdiff_t unrolled_src_stride = src_stride << 1; const ptrdiff_t unrolled_dst_stride = dst_stride << 1; - __m256i src_reg, src_reg_shuf; - __m256i dst; __m256i shuf_idx = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); @@ -733,12 +766,12 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, for (h = height; h > 1; h -= 2) { // Load the source - src_reg = mm256_loadu2_epi64((const __m128i *)src_ptr, - (const __m128i *)(src_ptr + src_stride)); - src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx); + const __m256i src_reg = mm256_loadu2_epi64( + (const __m128i *)src_ptr, (const __m128i *)(src_ptr + src_stride)); + const __m256i src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx); // Get the result - dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg); + __m256i dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg); dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256()); // Round result @@ -757,7 +790,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, if (h > 0) { // Load the source - const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr); __m128i src_reg_shuf = _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx)); @@ -768,7 +801,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, dst = _mm_hadds_epi16(dst, _mm_setzero_si128()); // Round result - dst = mm_round_epi16_sse2(&dst, &reg_32, 6); + dst = mm_round_epi16_sse2(&dst, &reg_32_128, 6); // Pack to 8-bits dst = _mm_packus_epi16(dst, _mm_setzero_si128()); @@ -866,22 +899,399 @@ static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr, } } +static void vpx_filter_block1d8_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m256i f[4], ss[4]; + __m256i r[8]; + __m128i s[9]; + + unsigned int y = output_height; + // Multiply the size of the source stride by two + const ptrdiff_t src_stride = src_pitch << 1; + + // The output_height is always a multiple of two. 
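// Editor's note: rows n and n+1 of the 8-wide block travel in the low and
// high 128-bit lanes of each __m256i, so every convolve8_16_avx2() call in
// the loop below produces two output rows; hence the even-height assert.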
+ assert(!(output_height & 1)); + + shuffle_filter_avx2(filter, f); + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + // merge the result together + // r[0]: 0 0 0 0 0 0 0 0 r17 r16 r15 r14 r13 r12 r11 r10 | 0 0 0 0 0 0 0 0 + // r07 r06 r05 r04 r03 r02 r01 r00 + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1); + + // r[1]: 0 0 0 0 0 0 0 0 r27 r26 r25 r24 r23 r22 r21 r20 | 0 0 0 0 0 0 0 0 + // r17 r16 r15 r14 r13 r12 r11 r10 + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1); + + // r[2]: 0 0 0 0 0 0 0 0 r37 r36 r35 r34 r33 r32 r31 r30 | 0 0 0 0 0 0 0 0 + // r27 r26 r25 r24 r23 r22 r21 r20 + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1); + + // r[3]: 0 0 0 0 0 0 0 0 r47 r46 r45 r44 r43 r42 r41 r40 | 0 0 0 0 0 0 0 0 + // r37 r36 r35 r34 r33 r32 r31 r30 + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1); + + // r[4]: 0 0 0 0 0 0 0 0 r57 r56 r55 r54 r53 r52 r51 r50 | 0 0 0 0 0 0 0 0 + // r47 r46 r45 r44 r43 r42 r41 r40 + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1); + + // r[5]: 0 0 0 0 0 0 0 0 r67 r66 r65 r64 r63 r62 r61 r60 | 0 0 0 0 0 0 0 0 + // r57 r56 r55 r54 r53 r52 r51 r50 + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[6], 1); + + // Merge together + // ss[0]: |r27 r17|.......|r21 r11|r20 r10 || r17 r07|.....|r12 r02|r11 + // r01|r10 r00| + ss[0] = _mm256_unpacklo_epi8(r[0], r[1]); + + // ss[1]: |r47 r37|.......|r41 r31|r40 r30 || r37 r27|.....|r32 r22|r31 + // r21|r30 r20| + ss[1] = _mm256_unpacklo_epi8(r[2], r[3]); + + // ss[2]: |r67 r57|.......|r61 r51|r60 r50 || r57 r47|.....|r52 r42|r51 + // r41|r50 r40| + ss[2] = _mm256_unpacklo_epi8(r[4], r[5]); + + // Process 2 rows at a time + do { + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + + // r[6]: 0 0 0 0 0 0 0 0 r77 r76 r75 r74 r73 r72 r71 r70 | 0 0 0 0 0 0 0 + // 0 r67 r66 r65 r64 r63 r62 r61 r60 + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[7], 1); + // r[7]: 0 0 0 0 0 0 0 0 r87 r86 r85 r84 r83 r82 r81 r80 | 0 0 0 0 0 0 0 + // 0 r77 r76 r75 r74 r73 r72 r71 r70 + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[8], 1); + + // ss[3] : | r87 r77 | .......| r81 r71 | r80 r70 || r77 r67 | .....| r72 + // r62 | r71 r61|r70 r60| + ss[3] = _mm256_unpacklo_epi8(r[6], r[7]); + ss[0] = convolve8_16_avx2(ss, f); + ss[0] = _mm256_packus_epi16(ss[0], ss[0]); + src_ptr += src_stride; + + /* shift down two rows */ + s[6] = s[8]; + _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(ss[0])); + output_ptr += out_pitch; + _mm_storel_epi64((__m128i *)&output_ptr[0], + _mm256_extractf128_si256(ss[0], 1)); + output_ptr += out_pitch; + ss[0] = ss[1]; + ss[1] = ss[2]; + ss[2] = ss[3]; + y -= 2; + } while (y > 1); +} + +static void vpx_filter_block1d4_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64_256bit; 
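// Editor's note: despite its name, this register is loaded with 32 per
// 16-bit lane (see below); the _mm256_hadds_epi16() pairwise add later
// doubles that into the usual +64 rounding term ahead of the >> 7 shift.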
+ unsigned int y = output_height; + + assert(output_height > 1); + + addFilterReg64_256bit = _mm256_set1_epi16(32); + + // f7 f6 f5 f4 f3 f2 f1 f0 (16 bit) + filtersReg = _mm_loadu_si128((const __m128i *)filter); + + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + // f7 f6 f5 f4 f3 f2 f1 f0 || f7 f6 f5 f4 f3 f2 f1 f0 (8 bit each) + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + { + ptrdiff_t src_stride; + __m256i filt1Reg, filt2Reg, firstFilters, secondFilters; + // have the same data in both lanes of a 256 bit register + // f7 f6 f5 f4 f3 f2 f1 f0 f7 f6 f5 f4 f3 f2 f1 f0 | f7 f6 f5 f4 f3 f2 f1 f0 + // f7 f6 f5 f4 f3 f2 f1 f0 (8bit each) + const __m256i filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); + + // duplicate only the first 32 bits + // f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0 | f3 f2 f1 f0|f3 f2 f1 + // f0|f3 f2 f1 f0|f3 f2 f1 f0 + firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); + // duplicate only the second 32 bits + // f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4 | f7 f6 f5 f4|f7 f6 f5 + // f4|f7 f6 f5 f4|f7 f6 f5 f4 + secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); + + // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3 + // s2 s4 s3 s2 s1 s3 s2 s1 s0 + filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); + + // s10 s9 s8 s7 s9 s8 s7 s6 s8 s7 s6 s5 s7 s6 s5 s4 | s10 s9 s8 s7 s9 s8 s7 + // s6 s8 s7 s6 s5 s7 s6 s5 s4 + filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + + do { + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcReg32b1; + // load the 2 strides of source + // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07 + // r06 r05 r04 r03 r02 r01 r00 + srcReg32b1 = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); + + // filter the source buffer + // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06 + // r05 r04 r03 r05 r04 r03 r02 r04 r03 r02 r01 r03 r02 r01 r00 + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + // ...|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||... 
+ // |f1*r03+f0*r02|f3*r04+f2*r03|f1*r02+f0*r01|f3*r03+f2*r02|f1*r01+f0*r00 + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + // filter the source buffer + // r110 r19 r18 r17|r19 r18 r17 r16|r18 r17 r16 r15|r17 r16 r15 r14||r010 + // r09 r08 r07|r09 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04 + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 4 adjacent elements with the filter and add the result + // r010 r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04||r010 + // r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04 + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + srcRegFilt32b1_1 = + _mm256_add_epi16(srcRegFilt32b1_1, addFilterReg64_256bit); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // 0 0 0 0 R13 R12 R11 R10 || 0 0 0 0 R03 R02 R01 R00 (16bit) + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + + // 8zeros 0 0 0 0 R13 R12 R11 R10 || 8zeros 0 0 0 0 R03 R02 R01 R00 (8bit) + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + // save first row 4 values + *((int *)&output_ptr[0]) = + _mm_cvtsi128_si32(_mm256_castsi256_si128(srcRegFilt32b1_1)); + output_ptr += output_pitch; + + // save second row 4 values + *((int *)&output_ptr[0]) = + _mm_cvtsi128_si32(_mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr += output_pitch; + + y = y - 2; + } while (y > 1); + + // For remaining height + if (y > 0) { + __m128i srcReg1, srcRegFilt1_1, addFilterReg64; + __m128i srcRegFilt2; + + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1_1 = + _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(firstFilters)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } + } +} + +static void vpx_filter_block1d4_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m256i f[4], ss[4]; + __m256i r[9], rr[2]; + __m128i s[11]; + + unsigned int y = output_height; + // Multiply the size of the source stride by four + const ptrdiff_t src_stride = src_pitch << 2; + const ptrdiff_t out_stride = out_pitch << 2; + + // The output_height is always a multiple of two. 
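// Editor's note: in this 4-wide vertical path, 4-pixel rows are packed in
// pairs with _mm256_unpacklo_epi32()/_mm256_unpacklo_epi8() so that a single
// convolve8_16_avx2() call in the main loop covers four output rows.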
+ assert(!(output_height & 0x01)); + + shuffle_filter_avx2(filter, f); + + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[2], 1); + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[3], 1); + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[4], 1); + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[5], 1); + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[6], 1); + + // r37.....r24..r33..r31 r30 r23 r22 r21 r20|r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[0], r[1]); + + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[1], r[2]); + + // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10 + // r00| + ss[0] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + // r37.....r24..r33..r31 r30 r23 r22 r21 r20||r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[2], r[3]); + + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[3], r[4]); + + // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30 + // r20| + ss[1] = _mm256_unpacklo_epi8(rr[0], rr[1]); + // Process 4 rows at a time + while (y >= 4) { + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch)); + s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch)); + + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[7], 1); + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[8], 1); + rr[0] = _mm256_unpacklo_epi32(r[4], r[5]); + rr[1] = _mm256_unpacklo_epi32(r[5], r[6]); + ss[2] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[9], 1); + r[8] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[8]), s[10], 1); + rr[0] = _mm256_unpacklo_epi32(r[6], r[7]); + rr[1] = _mm256_unpacklo_epi32(r[7], r[8]); + ss[3] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + ss[0] = convolve8_16_avx2(ss, f); + + // r3 r2 r3 r2 r1 r0 r1 r0 + ss[0] = _mm256_packus_epi16(ss[0], ss[0]); + src_ptr += src_stride; + + mm256_storeu2_epi32((__m128i *const)output_ptr, + (__m128i *const)(output_ptr + (2 * out_pitch)), ss); + + ss[0] = _mm256_srli_si256(ss[0], 4); + + mm256_storeu2_epi32((__m128i *const)(output_ptr + (1 * out_pitch)), + (__m128i *const)(output_ptr + (3 * out_pitch)), ss); + + output_ptr += out_stride; + + ss[0] = ss[2]; + ss[1] = ss[3]; + + s[6] = s[10]; + s[5] = s[9]; + + r[4] = r[8]; + y -= 4; + } + + // Process 2 rows + if (y == 2) { + __m128i ss1[4], f1[4], r1[4]; + + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + + 
f1[0] = _mm256_castsi256_si128(f[0]); + f1[1] = _mm256_castsi256_si128(f[1]); + f1[2] = _mm256_castsi256_si128(f[2]); + f1[3] = _mm256_castsi256_si128(f[3]); + + r1[0] = _mm_unpacklo_epi32(s[4], s[5]); + r1[1] = _mm_unpacklo_epi32(s[5], s[6]); + + // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60 + r1[2] = _mm_unpacklo_epi32(s[6], s[7]); + + // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70 + r1[3] = _mm_unpacklo_epi32(s[7], s[8]); + + // r23 r13....r20 r10|r13 r03....r10 r00 + ss1[0] = _mm256_castsi256_si128(ss[0]); + + // r43 r33....r40 r30|r33 r23....r30 r20 + ss1[1] = _mm256_castsi256_si128(ss[1]); + + // r63 r53....r60 r50|r53 r43....r50 r40 + ss1[2] = _mm_unpacklo_epi8(r1[0], r1[1]); + + // r83 r73....r80 r70|r73 r63....r70 r60 + ss1[3] = _mm_unpacklo_epi8(r1[2], r1[3]); + + ss1[0] = convolve8_8_ssse3(ss1, f1); + + // r1 r0 r1 r0 + ss1[0] = _mm_packus_epi16(ss1[0], ss1[0]); + + // Save first row 4 values + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]); + output_ptr += out_pitch; + + ss1[0] = _mm_srli_si128(ss1[0], 4); + // Save second row 4 values + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]); + } +} + #if HAVE_AVX2 && HAVE_SSSE3 -filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3 -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 -#else // VPX_ARCH_X86 +#else // VPX_ARCH_X86 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif // VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; @@ -897,7 +1307,6 @@ filter8_1dfunction vpx_filter_block1d8_v2_ssse3; filter8_1dfunction vpx_filter_block1d8_h2_ssse3; filter8_1dfunction vpx_filter_block1d4_v2_ssse3; filter8_1dfunction vpx_filter_block1d4_h2_ssse3; -#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3 #define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3 #define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3 #define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 diff --git a/vpx_ports/aarch32_cpudetect.c b/vpx_ports/aarch32_cpudetect.c new file mode 100644 index 000000000..639f4ff8e --- /dev/null +++ b/vpx_ports/aarch32_cpudetect.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +// Feature detection code for Armv7-A / AArch32. + +#include "./vpx_config.h" +#include "arm_cpudetect.h" + +#if !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + // This function should actually be a no-op. 
There is no way to adjust any of + // these because the RTCD tables do not exist: the functions are called + // statically. + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON + return flags; +} + +#elif defined(_MSC_VER) // end !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON || HAVE_NEON_ASM + // MSVC has no inline __asm support for Arm, but it does let you __emit + // instructions via their assembled hex code. + // All of these instructions should be essentially nops. + __try { + // VORR q0,q0,q0 + __emit(0xF2200150); + flags |= HAS_NEON; + } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { + // Ignore exception. + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags; +} + +#elif defined(ANDROID_USE_CPU_FEATURES_LIB) + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON || HAVE_NEON_ASM + uint64_t features = android_getCpuFeatures(); + if (features & ANDROID_CPU_ARM_FEATURE_NEON) { + flags |= HAS_NEON; + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags; +} + +#elif defined(__linux__) // end defined(AOM_USE_ANDROID_CPU_FEATURES) + +#include <sys/auxv.h> + +// Define hwcap values ourselves: building with an old auxv header where these +// hwcap values are not defined should not prevent features from being enabled. +#define VPX_AARCH32_HWCAP_NEON (1 << 12) + +static int arm_get_cpu_caps(void) { + int flags = 0; + unsigned long hwcap = getauxval(AT_HWCAP); +#if HAVE_NEON || HAVE_NEON_ASM + if (hwcap & VPX_AARCH32_HWCAP_NEON) { + flags |= HAS_NEON; + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags; +} +#else // end __linux__ +#error \ + "Runtime CPU detection selected, but no CPU detection method available" \ +"for your platform. Rerun configure with --disable-runtime-cpu-detect." +#endif + +int arm_cpu_caps(void) { + int flags = 0; + if (arm_cpu_env_flags(&flags)) { + return flags; + } + return arm_get_cpu_caps() & arm_cpu_env_mask(); +} diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c new file mode 100644 index 000000000..539d09bb3 --- /dev/null +++ b/vpx_ports/aarch64_cpudetect.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "arm_cpudetect.h" + +#if defined(__APPLE__) +#include <sys/sysctl.h> +#endif + +#if !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + // This function should actually be a no-op. There is no way to adjust any of + // these because the RTCD tables do not exist: the functions are called + // statically. 
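// Editor's note: whichever branch defines arm_get_cpu_caps(), the public
// arm_cpu_caps() at the bottom of this file still honors the VPX_SIMD_CAPS
// and VPX_SIMD_CAPS_MASK environment overrides; for example, running with
// VPX_SIMD_CAPS_MASK=0x1 keeps only HAS_NEON (bit 0) and masks off the
// dot-product, I8MM and SVE flags.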
+ int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON + return flags; +} + +#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT + +// sysctlbyname() parameter documentation for instruction set characteristics: +// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics +static INLINE int64_t have_feature(const char *feature) { + int64_t feature_present = 0; + size_t size = sizeof(feature_present); + if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) { + return 0; + } + return feature_present; +} + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD + if (have_feature("hw.optional.arm.FEAT_DotProd")) { + flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (have_feature("hw.optional.arm.FEAT_I8MM")) { + flags |= HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM + return flags; +} + +#elif defined(_WIN32) // end __APPLE__ + +static int arm_get_cpu_caps(void) { + int flags = 0; +// IsProcessorFeaturePresent() parameter documentation: +// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent#parameters +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD +// Support for PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE was added in Windows SDK +// 20348, supported by Windows 11 and Windows Server 2022. +#if defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) + if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) { + flags |= HAS_NEON_DOTPROD; + } +#endif // defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) +#endif // HAVE_NEON_DOTPROD + // No I8MM or SVE feature detection available on Windows at time of writing. + return flags; +} + +#elif defined(ANDROID_USE_CPU_FEATURES_LIB) + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON + return flags; +} + +#elif defined(__linux__) // end defined(VPX_USE_ANDROID_CPU_FEATURES) + +#include <sys/auxv.h> + +// Define hwcap values ourselves: building with an old auxv header where these +// hwcap values are not defined should not prevent features from being enabled. +#define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20) +#define VPX_AARCH64_HWCAP_SVE (1 << 22) +#define VPX_AARCH64_HWCAP2_I8MM (1 << 13) + +static int arm_get_cpu_caps(void) { + int flags = 0; + unsigned long hwcap = getauxval(AT_HWCAP); +#if HAVE_NEON_I8MM + unsigned long hwcap2 = getauxval(AT_HWCAP2); +#endif // HAVE_NEON_I8MM +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD + if (hwcap & VPX_AARCH64_HWCAP_ASIMDDP) { + flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (hwcap2 & VPX_AARCH64_HWCAP2_I8MM) { + flags |= HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM +#if HAVE_SVE + if (hwcap & VPX_AARCH64_HWCAP_SVE) { + flags |= HAS_SVE; + } +#endif // HAVE_SVE + return flags; +} + +#elif defined(__Fuchsia__) // end __linux__ + +#include <zircon/features.h> +#include <zircon/syscalls.h> + +// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/894282. +#ifndef ZX_ARM64_FEATURE_ISA_I8MM +#define ZX_ARM64_FEATURE_ISA_I8MM ((uint32_t)(1u << 19)) +#endif +// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/895083. 
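// Editor's note: as with the Linux hwcap values above, the bit is defined
// locally (guarded by #ifndef, below) so that building against an older
// Fuchsia sysroot does not silently drop the feature; the
// zx_system_get_features() query still gates it at runtime.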
+#ifndef ZX_ARM64_FEATURE_ISA_SVE +#define ZX_ARM64_FEATURE_ISA_SVE ((uint32_t)(1u << 20)) +#endif + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON + uint32_t features; + zx_status_t status = zx_system_get_features(ZX_FEATURE_KIND_CPU, &features); + if (status != ZX_OK) { + return flags; + } +#if HAVE_NEON_DOTPROD + if (features & ZX_ARM64_FEATURE_ISA_DP) { + flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (features & ZX_ARM64_FEATURE_ISA_I8MM) { + flags |= HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM +#if HAVE_SVE + if (features & ZX_ARM64_FEATURE_ISA_SVE) { + flags |= HAS_SVE; + } +#endif // HAVE_SVE + return flags; +} + +#else // end __Fuchsia__ +#error \ + "Runtime CPU detection selected, but no CPU detection method available" \ +"for your platform. Rerun configure with --disable-runtime-cpu-detect." +#endif + +int arm_cpu_caps(void) { + int flags = 0; + if (!arm_cpu_env_flags(&flags)) { + flags = arm_get_cpu_caps() & arm_cpu_env_mask(); + } + + // Restrict flags: FEAT_I8MM assumes that FEAT_DotProd is available. + if (!(flags & HAS_NEON_DOTPROD)) { + flags &= ~HAS_NEON_I8MM; + } + + // Restrict flags: FEAT_SVE assumes that FEAT_{DotProd,I8MM} are available. + if (!(flags & HAS_NEON_DOTPROD)) { + flags &= ~HAS_SVE; + } + if (!(flags & HAS_NEON_I8MM)) { + flags &= ~HAS_SVE; + } + + return flags; +} diff --git a/vpx_ports/arm.h b/vpx_ports/arm.h index 6458a2c5b..39365d18e 100644 --- a/vpx_ports/arm.h +++ b/vpx_ports/arm.h @@ -17,12 +17,14 @@ extern "C" { #endif -/*ARMv5TE "Enhanced DSP" instructions.*/ -#define HAS_EDSP 0x01 -/*ARMv6 "Parallel" or "Media" instructions.*/ -#define HAS_MEDIA 0x02 -/*ARMv7 optional NEON instructions.*/ -#define HAS_NEON 0x04 +// Armv7-A optional Neon instructions, mandatory from Armv8.0-A. +#define HAS_NEON (1 << 0) +// Armv8.2-A optional Neon dot-product instructions, mandatory from Armv8.4-A. +#define HAS_NEON_DOTPROD (1 << 1) +// Armv8.2-A optional Neon i8mm instructions, mandatory from Armv8.6-A. +#define HAS_NEON_I8MM (1 << 2) +// Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A. +#define HAS_SVE (1 << 3) int arm_cpu_caps(void); diff --git a/vpx_ports/arm_cpudetect.c b/vpx_ports/arm_cpudetect.c deleted file mode 100644 index 4f9d480ad..000000000 --- a/vpx_ports/arm_cpudetect.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <stdlib.h> -#include <string.h> - -#include "./vpx_config.h" -#include "vpx_ports/arm.h" - -#ifdef WINAPI_FAMILY -#include <winapifamily.h> -#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) -#define getenv(x) NULL -#endif -#endif - -static int arm_cpu_env_flags(int *flags) { - char *env; - env = getenv("VPX_SIMD_CAPS"); - if (env && *env) { - *flags = (int)strtol(env, NULL, 0); - return 0; - } - *flags = 0; - return -1; -} - -static int arm_cpu_env_mask(void) { - char *env; - env = getenv("VPX_SIMD_CAPS_MASK"); - return env && *env ? (int)strtol(env, NULL, 0) : ~0; -} - -#if !CONFIG_RUNTIME_CPU_DETECT - -int arm_cpu_caps(void) { - /* This function should actually be a no-op. 
There is no way to adjust any of - * these because the RTCD tables do not exist: the functions are called - * statically */ - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); -#if HAVE_NEON || HAVE_NEON_ASM - flags |= HAS_NEON; -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - -#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */ -/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#ifndef WIN32_EXTRA_LEAN -#define WIN32_EXTRA_LEAN -#endif -#include <windows.h> - -int arm_cpu_caps(void) { - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); -/* MSVC has no inline __asm support for ARM, but it does let you __emit - * instructions via their assembled hex code. - * All of these instructions should be essentially nops. - */ -#if HAVE_NEON || HAVE_NEON_ASM - if (mask & HAS_NEON) { - __try { - /*VORR q0,q0,q0*/ - __emit(0xF2200150); - flags |= HAS_NEON; - } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { - /*Ignore exception.*/ - } - } -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - -#elif defined(__ANDROID__) /* end _MSC_VER */ -#include <cpu-features.h> - -int arm_cpu_caps(void) { - int flags; - int mask; - uint64_t features; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); - features = android_getCpuFeatures(); - -#if HAVE_NEON || HAVE_NEON_ASM - if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON; -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - -#elif defined(__linux__) /* end __ANDROID__ */ - -#include <stdio.h> - -int arm_cpu_caps(void) { - FILE *fin; - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); - /* Reading /proc/self/auxv would be easier, but that doesn't work reliably - * on Android. - * This also means that detection will fail in Scratchbox. - */ - fin = fopen("/proc/cpuinfo", "r"); - if (fin != NULL) { - /* 512 should be enough for anybody (it's even enough for all the flags - * that x86 has accumulated... so far). - */ - char buf[512]; - while (fgets(buf, 511, fin) != NULL) { -#if HAVE_NEON || HAVE_NEON_ASM - if (memcmp(buf, "Features", 8) == 0) { - char *p; - p = strstr(buf, " neon"); - if (p != NULL && (p[5] == ' ' || p[5] == '\n')) { - flags |= HAS_NEON; - } - } -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - } - fclose(fin); - } - return flags & mask; -} -#else /* end __linux__ */ -#error \ - "--enable-runtime-cpu-detect selected, but no CPU detection method " \ -"available for your platform. Reconfigure with --disable-runtime-cpu-detect." -#endif diff --git a/vpx_ports/arm_cpudetect.h b/vpx_ports/arm_cpudetect.h new file mode 100644 index 000000000..881397abc --- /dev/null +++ b/vpx_ports/arm_cpudetect.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <stdlib.h> +#include <string.h> + +#include "vpx_config.h" +#include "vpx_ports/arm.h" + +#if defined(_WIN32) +#undef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#undef WIN32_EXTRA_LEAN +#define WIN32_EXTRA_LEAN +#include <windows.h> +#endif + +#ifdef WINAPI_FAMILY +#include <winapifamily.h> +#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define getenv(x) NULL +#endif +#endif + +#if defined(__ANDROID__) && (__ANDROID_API__ < 18) +#define ANDROID_USE_CPU_FEATURES_LIB 1 +// Use getauxval() when targeting (64-bit) Android with API level >= 18. +// getauxval() is supported since Android API level 18 (Android 4.3.) +// First Android version with 64-bit support was Android 5.x (API level 21). +#include <cpu-features.h> +#endif + +static INLINE int arm_cpu_env_flags(int *flags) { + const char *env = getenv("VPX_SIMD_CAPS"); + if (env && *env) { + *flags = (int)strtol(env, NULL, 0); + return 1; + } + return 0; +} + +static INLINE int arm_cpu_env_mask(void) { + const char *env = getenv("VPX_SIMD_CAPS_MASK"); + return env && *env ? (int)strtol(env, NULL, 0) : ~0; +} diff --git a/vpx_ports/vpx_ports.mk b/vpx_ports/vpx_ports.mk index e30e87cef..93279dbeb 100644 --- a/vpx_ports/vpx_ports.mk +++ b/vpx_ports/vpx_ports.mk @@ -36,7 +36,12 @@ PORTS_SRCS-yes += x86.h PORTS_SRCS-yes += x86_abi_support.asm endif -PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.c +ifeq ($(VPX_ARCH_AARCH64),yes) +PORTS_SRCS-yes += aarch64_cpudetect.c +else +PORTS_SRCS-$(VPX_ARCH_ARM) += aarch32_cpudetect.c +endif +PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.h PORTS_SRCS-$(VPX_ARCH_ARM) += arm.h PORTS_SRCS-$(VPX_ARCH_PPC) += ppc_cpudetect.c @@ -446,7 +446,7 @@ static void generate_filename(const char *pattern, char *out, size_t q_len, case '7': snprintf(q, q_len - 1, "%07d", frame_in); break; case '8': snprintf(q, q_len - 1, "%08d", frame_in); break; case '9': snprintf(q, q_len - 1, "%09d", frame_in); break; - default: die("Unrecognized pattern %%%c\n", p[1]); break; + default: die("Unrecognized pattern %%%c\n", p[1]); } pat_len = strlen(q); @@ -996,7 +996,7 @@ static int main_loop(int argc, const char **argv_) { if (single_file) { if (use_y4m) { - char buf[Y4M_BUFFER_SIZE] = { 0 }; + char y4m_buf[Y4M_BUFFER_SIZE] = { 0 }; size_t len = 0; if (img->fmt == VPX_IMG_FMT_I440 || img->fmt == VPX_IMG_FMT_I44016) { fprintf(stderr, "Cannot produce y4m output for 440 sampling.\n"); @@ -1005,21 +1005,22 @@ static int main_loop(int argc, const char **argv_) { if (frame_out == 1) { // Y4M file header len = y4m_write_file_header( - buf, sizeof(buf), vpx_input_ctx.width, vpx_input_ctx.height, - &vpx_input_ctx.framerate, img->fmt, img->bit_depth); + y4m_buf, sizeof(y4m_buf), vpx_input_ctx.width, + vpx_input_ctx.height, &vpx_input_ctx.framerate, img->fmt, + img->bit_depth); if (do_md5) { - MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len); + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); } else { - fputs(buf, outfile); + fputs(y4m_buf, outfile); } } // Y4M frame header - len = y4m_write_frame_header(buf, sizeof(buf)); + len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf)); if (do_md5) { - MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len); + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); } else { - fputs(buf, outfile); + fputs(y4m_buf, outfile); } } else { if (frame_out == 1) { @@ -883,7 +883,7 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global, /* Default lag_in_frames is 0 in realtime mode CBR mode*/ if (global->deadline == VPX_DL_REALTIME && - 
stream->config.cfg.rc_end_usage == 1) + stream->config.cfg.rc_end_usage == VPX_CBR) stream->config.cfg.g_lag_in_frames = 0; } @@ -1586,13 +1586,14 @@ static void test_decode(struct stream_state *stream, /* Get the internal reference frame */ if (strcmp(codec->name, "vp8") == 0) { struct vpx_ref_frame ref_enc, ref_dec; - int width, height; + int aligned_width = (stream->config.cfg.g_w + 15) & ~15; + int aligned_height = (stream->config.cfg.g_h + 15) & ~15; - width = (stream->config.cfg.g_w + 15) & ~15; - height = (stream->config.cfg.g_h + 15) & ~15; - vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, width, height, 1); + vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, aligned_width, aligned_height, + 1); enc_img = ref_enc.img; - vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, width, height, 1); + vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, aligned_width, aligned_height, + 1); dec_img = ref_dec.img; ref_enc.frame_type = VP8_LAST_FRAME; @@ -1969,10 +1970,9 @@ int main(int argc, const char **argv_) { } else { const int64_t input_pos = ftello(input.file); const int64_t input_pos_lagged = input_pos - lagged_count; - const int64_t limit = input.length; rate = cx_time ? input_pos_lagged * (int64_t)1000000 / cx_time : 0; - remaining = limit - input_pos + lagged_count; + remaining = input.length - input_pos + lagged_count; } average_rate = diff --git a/y4minput.c b/y4minput.c index 745e2f1cd..210ce52fc 100644 --- a/y4minput.c +++ b/y4minput.c @@ -1148,6 +1148,7 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) { _img->fmt = _y4m->vpx_fmt; _img->w = _img->d_w = _y4m->pic_w; _img->h = _img->d_h = _y4m->pic_h; + _img->bit_depth = _y4m->bit_depth; _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1; _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1; _img->bps = _y4m->bps;
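Editor's note: for reference against the SSE4.1/AVX2/Neon kernels added earlier in this change, the sum-of-squared-error they all compute reduces to the following scalar form (a sketch modeled on the new vpx_dsp/sse.c; the name sse_ref is illustrative, not the verbatim source):

#include <stdint.h>

int64_t sse_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                int b_stride, int width, int height) {
  int64_t sse = 0;
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      const int32_t diff = a[x] - b[x];  // per-pixel difference
      sse += diff * diff;                // accumulate squared error
    }
    a += a_stride;  // advance to the next row of each buffer
    b += b_stride;
  }
  return sse;
}

The SIMD versions differ only in computing the difference, multiply and accumulate for 8 to 32 pixels at a time and, in the high-bitdepth variants, in periodically widening 32-bit partial sums to 64 bits (the summary_32_* helpers) to avoid overflow.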