author    johannkoenig@chromium.org <johannkoenig@chromium.org>  2014-09-08 18:46:28 +0000
committer johannkoenig@chromium.org <johannkoenig@chromium.org>  2014-09-08 18:46:28 +0000
commit    d95585fb0ec024f6abd96f7b02e0df58019d46af (patch)
tree      099198c9fe84d7f873666002a1d5c63421785899
parent    0f393e92b0e220eeaa6acb0ad31e696fa5c67ccd (diff)
download  libvpx-d95585fb0ec024f6abd96f7b02e0df58019d46af.tar.gz
libvpx: Pull from upstream
Current HEAD: c731d6a4f19eea861ceb2ff31399420b2452eb74

git log from upstream:
395f2e8 vp8 encoder: remove vp8_yv12_copy_partial_frame_neon
980abf6 Fixing Mac OS build.
fcd431f libyuv: cherry-pick MSVC arm build fix
1f19ebb Replacing vp9_get_mb_ss_sse2 asm implementation with intrinsics.
1dd9a63 Correct the mode decisions in special cases
1100e26 Removing postproc mmx code.
c97f5e8 vp8 common: change 'HAVE_NEON_ASM' to 'HAVE_NEON' for compiling functions of NEON intrinsics.
a808344 fix x86-darwin* build
35fadf1 bilinearpredict_neon: fix type conversion warnings
bb4950d vp9: correct context buffer resize check
440f509 vp9: fail decode if block/frame refs are corrupt
dbdb87b Fix a visual studio warning
d435148 Enable adaptive motion search for ARF coding
b1153f3 Map motion magnitude in VP9 denoiser.
7897059 Adding temp cpi var.
91998e6 Removing sz member from vpx_codec_priv.
d75266f Update the condition when COPY_BLOCK is chosen.
4909435 Removing unused function prototypes.
202edb3 Actually resetting random generator for all variance test cases.
e30f769 Fix a bug in VP9 denoiser.
ec94967 Revert "Revert "VP8 for ARMv8 by using NEON intrinsics 10""
a51704d vp8 common: change 'HAVE_NEON_ASM' to 'HAVE_NEON' for compiling idct_blk_neon.c.
0002da3 arm: Fix building vp8_subpixelvariance_neon.c with MSVC
48197f0 Adding sse2 variant for vp9_mse{8x8, 8x16, 16x8}.
<...>

TBR=tomfinegan@chromium.org

Review URL: https://codereview.chromium.org/554673004

git-svn-id: http://src.chromium.org/svn/trunk/deps/third_party/libvpx@291859 4ff67af0-8c30-449e-8e8b-ad334ec8d88c
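
Many of the generated vp8_rtcd.h/vp9_rtcd.h changes in this roll swap dispatch stubs as assembly routines are replaced with NEON intrinsics. As a rough illustration of the run-time CPU detection (RTCD) pattern those headers use — a minimal sketch with hypothetical function names, not the real prototypes — each optimized routine is reached through a function pointer that setup_rtcd_internal() repoints when the CPU reports the matching feature flag (compare the RTCD_EXTERN declarations and HAS_NEON checks in the cpu-detect headers below):

#include <stdio.h>

#define HAS_NEON 0x1  /* stand-in for the flag bit reported by CPU detection */

static void idct_add_c(void)    { puts("C fallback"); }
static void idct_add_neon(void) { puts("NEON variant"); }

/* RTCD_EXTERN-style pointer; every call site dispatches through it. */
static void (*idct_add)(void) = idct_add_c;

static void setup_rtcd_internal(int flags)
{
    idct_add = idct_add_c;
    if (flags & HAS_NEON) idct_add = idct_add_neon;
}

int main(void)
{
    setup_rtcd_internal(HAS_NEON);  /* pretend the CPU reported NEON */
    idct_add();                     /* prints "NEON variant" */
    return 0;
}
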
-rw-r--r--  README.chromium | 4
-rw-r--r--  libvpx_srcs.gni | 50
-rw-r--r--  libvpx_srcs_arm64.gypi | 8
-rw-r--r--  libvpx_srcs_arm_neon.gypi | 17
-rw-r--r--  libvpx_srcs_arm_neon_cpu_detect.gypi | 10
-rw-r--r--  libvpx_srcs_arm_neon_cpu_detect_intrinsics.gypi | 7
-rw-r--r--  libvpx_srcs_x86.gypi | 3
-rw-r--r--  libvpx_srcs_x86_64.gypi | 3
-rw-r--r--  libvpx_srcs_x86_64_intrinsics.gypi | 1
-rw-r--r--  libvpx_srcs_x86_intrinsics.gypi | 1
-rw-r--r--  source/config/linux/arm-neon-cpu-detect/vp8_rtcd.h | 16
-rw-r--r--  source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h | 9
-rw-r--r--  source/config/linux/arm-neon-cpu-detect/vpx_config.asm | 1
-rw-r--r--  source/config/linux/arm-neon-cpu-detect/vpx_config.h | 1
-rw-r--r--  source/config/linux/arm-neon/vp8_rtcd.h | 10
-rw-r--r--  source/config/linux/arm-neon/vp9_rtcd.h | 9
-rw-r--r--  source/config/linux/arm-neon/vpx_config.asm | 1
-rw-r--r--  source/config/linux/arm-neon/vpx_config.h | 1
-rw-r--r--  source/config/linux/arm/vp8_rtcd.h | 3
-rw-r--r--  source/config/linux/arm/vp9_rtcd.h | 9
-rw-r--r--  source/config/linux/arm/vpx_config.asm | 1
-rw-r--r--  source/config/linux/arm/vpx_config.h | 1
-rw-r--r--  source/config/linux/arm64/vp8_rtcd.h | 33
-rw-r--r--  source/config/linux/arm64/vp9_rtcd.h | 9
-rw-r--r--  source/config/linux/arm64/vpx_config.asm | 1
-rw-r--r--  source/config/linux/arm64/vpx_config.h | 1
-rw-r--r--  source/config/linux/generic/vp8_rtcd.h | 3
-rw-r--r--  source/config/linux/generic/vp9_rtcd.h | 9
-rw-r--r--  source/config/linux/generic/vpx_config.asm | 1
-rw-r--r--  source/config/linux/generic/vpx_config.h | 1
-rw-r--r--  source/config/linux/ia32/vp8_rtcd.h | 3
-rw-r--r--  source/config/linux/ia32/vp9_rtcd.h | 50
-rw-r--r--  source/config/linux/ia32/vpx_config.asm | 1
-rw-r--r--  source/config/linux/ia32/vpx_config.h | 1
-rw-r--r--  source/config/linux/mips64el/vp8_rtcd.h | 3
-rw-r--r--  source/config/linux/mips64el/vp9_rtcd.h | 9
-rw-r--r--  source/config/linux/mips64el/vpx_config.h | 1
-rw-r--r--  source/config/linux/mipsel/vp8_rtcd.h | 3
-rw-r--r--  source/config/linux/mipsel/vp9_rtcd.h | 9
-rw-r--r--  source/config/linux/mipsel/vpx_config.h | 1
-rw-r--r--  source/config/linux/x64/vp8_rtcd.h | 3
-rw-r--r--  source/config/linux/x64/vp9_rtcd.h | 31
-rw-r--r--  source/config/linux/x64/vpx_config.asm | 1
-rw-r--r--  source/config/linux/x64/vpx_config.h | 1
-rw-r--r--  source/config/mac/ia32/vp8_rtcd.h | 3
-rw-r--r--  source/config/mac/ia32/vp9_rtcd.h | 76
-rw-r--r--  source/config/mac/ia32/vpx_config.asm | 1
-rw-r--r--  source/config/mac/ia32/vpx_config.h | 1
-rw-r--r--  source/config/mac/x64/vp8_rtcd.h | 3
-rw-r--r--  source/config/mac/x64/vp9_rtcd.h | 31
-rw-r--r--  source/config/mac/x64/vpx_config.asm | 1
-rw-r--r--  source/config/mac/x64/vpx_config.h | 1
-rw-r--r--  source/config/nacl/vp8_rtcd.h | 3
-rw-r--r--  source/config/nacl/vp9_rtcd.h | 9
-rw-r--r--  source/config/nacl/vpx_config.asm | 1
-rw-r--r--  source/config/nacl/vpx_config.h | 1
-rw-r--r--  source/config/win/ia32/vp8_rtcd.h | 3
-rw-r--r--  source/config/win/ia32/vp9_rtcd.h | 50
-rw-r--r--  source/config/win/ia32/vpx_config.asm | 1
-rw-r--r--  source/config/win/ia32/vpx_config.h | 1
-rw-r--r--  source/config/win/x64/vp8_rtcd.h | 3
-rw-r--r--  source/config/win/x64/vp9_rtcd.h | 31
-rw-r--r--  source/config/win/x64/vpx_config.asm | 1
-rw-r--r--  source/config/win/x64/vpx_config.h | 1
-rwxr-xr-x  source/libvpx/build/make/gen_msvs_proj.sh | 8
-rwxr-xr-x  source/libvpx/build/make/gen_msvs_vcxproj.sh | 8
-rwxr-xr-x  source/libvpx/configure | 2
-rw-r--r--  source/libvpx/examples.mk | 3
-rw-r--r--  source/libvpx/examples/set_maps.c | 6
-rw-r--r--  source/libvpx/examples/twopass_encoder.c | 201
-rw-r--r--  source/libvpx/examples/vp8_multi_resolution_encoder.c | 675
-rw-r--r--  source/libvpx/examples/vp9_spatial_svc_encoder.c | 17
-rw-r--r--  source/libvpx/examples/vpx_temporal_svc_encoder.c | 16
-rw-r--r--  source/libvpx/libs.mk | 2
-rw-r--r--  source/libvpx/test/active_map_test.cc | 4
-rw-r--r--  source/libvpx/test/datarate_test.cc | 35
-rw-r--r--  source/libvpx/test/dct16x16_test.cc | 6
-rw-r--r--  source/libvpx/test/dct32x32_test.cc | 6
-rw-r--r--  source/libvpx/test/decode_perf_test.cc | 2
-rw-r--r--  source/libvpx/test/decode_test_driver.cc | 2
-rw-r--r--  source/libvpx/test/decode_test_driver.h | 10
-rw-r--r--  source/libvpx/test/encode_test_driver.cc | 6
-rw-r--r--  source/libvpx/test/encode_test_driver.h | 19
-rw-r--r--  source/libvpx/test/external_frame_buffer_test.cc | 2
-rw-r--r--  source/libvpx/test/fdct4x4_test.cc | 4
-rw-r--r--  source/libvpx/test/fdct8x8_test.cc | 2
-rw-r--r--  source/libvpx/test/frame_size_tests.cc | 2
-rw-r--r--  source/libvpx/test/intrapred_test.cc | 10
-rw-r--r--  source/libvpx/test/invalid_file_test.cc | 17
-rw-r--r--  source/libvpx/test/md5_helper.h | 3
-rw-r--r--  source/libvpx/test/resize_test.cc | 4
-rw-r--r--  source/libvpx/test/sad_test.cc | 15
-rw-r--r--  source/libvpx/test/svc_test.cc | 371
-rw-r--r--  source/libvpx/test/test-data.sha1 | 20
-rw-r--r--  source/libvpx/test/test.mk | 20
-rw-r--r--  source/libvpx/test/test_vectors.cc | 3
-rw-r--r--  source/libvpx/test/tile_independence_test.cc | 2
-rw-r--r--  source/libvpx/test/user_priv_test.cc | 2
-rw-r--r--  source/libvpx/test/variance_test.cc | 100
-rw-r--r--  source/libvpx/test/vp8_decrypt_test.cc | 2
-rwxr-xr-x  source/libvpx/test/vp8_multi_resolution_encoder.sh | 75
-rw-r--r--  source/libvpx/test/vp9_decrypt_test.cc | 2
-rw-r--r--  source/libvpx/test/vp9_thread_test.cc | 2
-rwxr-xr-x  source/libvpx/test/vpxenc.sh | 164
-rw-r--r--  source/libvpx/test/y4m_test.cc | 2
-rw-r--r--  source/libvpx/third_party/libyuv/README.libvpx | 4
-rw-r--r--  source/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h | 1
-rw-r--r--  source/libvpx/third_party/libyuv/include/libyuv/row.h | 90
-rw-r--r--  source/libvpx/third_party/libyuv/include/libyuv/scale_row.h | 8
-rw-r--r--  source/libvpx/third_party/libyuv/include/libyuv/version.h | 2
-rw-r--r--  source/libvpx/third_party/libyuv/source/compare.cc | 2
-rw-r--r--  source/libvpx/third_party/libyuv/source/compare_neon.cc | 39
-rw-r--r--  source/libvpx/third_party/libyuv/source/convert.cc | 306
-rw-r--r--  source/libvpx/third_party/libyuv/source/convert_from_argb.cc | 168
-rw-r--r--  source/libvpx/third_party/libyuv/source/cpu_id.cc | 14
-rw-r--r--  source/libvpx/third_party/libyuv/source/format_conversion.cc | 12
-rw-r--r--  source/libvpx/third_party/libyuv/source/mjpeg_decoder.cc | 10
-rw-r--r--  source/libvpx/third_party/libyuv/source/row_any.cc | 24
-rw-r--r--  source/libvpx/third_party/libyuv/source/row_neon64.cc | 437
-rw-r--r--  source/libvpx/third_party/libyuv/source/row_win.cc | 7
-rw-r--r--  source/libvpx/third_party/libyuv/source/scale_neon64.cc | 790
-rw-r--r--  source/libvpx/tools_common.c | 5
-rw-r--r--  source/libvpx/tools_common.h | 16
-rw-r--r--  source/libvpx/vp8/common/arm/loopfilter_arm.c | 8
-rw-r--r--  source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c | 14
-rw-r--r--  source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm | 595
-rw-r--r--  source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm | 81
-rw-r--r--  source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c | 62
-rw-r--r--  source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm | 199
-rw-r--r--  source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c | 185
-rw-r--r--  source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm | 409
-rw-r--r--  source/libvpx/vp8/common/arm/neon/loopfilter_neon.c | 549
-rw-r--r--  source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm | 156
-rw-r--r--  source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c | 279
-rw-r--r--  source/libvpx/vp8/common/arm/neon/reconintra_neon.c | 210
-rw-r--r--  source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm | 425
-rw-r--r--  source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm | 583
-rw-r--r--  source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm | 225
-rw-r--r--  source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c | 1028
-rw-r--r--  source/libvpx/vp8/common/arm/reconintra_arm.c | 58
-rw-r--r--  source/libvpx/vp8/common/arm/variance_arm.c | 2
-rw-r--r--  source/libvpx/vp8/common/onyx.h | 2
-rw-r--r--  source/libvpx/vp8/common/rtcd_defs.pl | 39
-rw-r--r--  source/libvpx/vp8/encoder/arm/neon/picklpf_arm.c | 46
-rw-r--r--  source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm | 221
-rw-r--r--  source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c | 269
-rw-r--r--  source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm | 72
-rw-r--r--  source/libvpx/vp8/encoder/denoising.c | 6
-rw-r--r--  source/libvpx/vp8/encoder/onyx_if.c | 2
-rw-r--r--  source/libvpx/vp8/encoder/pickinter.c | 10
-rw-r--r--  source/libvpx/vp8/encoder/picklpf.c | 10
-rw-r--r--  source/libvpx/vp8/vp8_common.mk | 19
-rw-r--r--  source/libvpx/vp8/vp8_cx_iface.c | 100
-rw-r--r--  source/libvpx/vp8/vp8_dx_iface.c | 66
-rw-r--r--  source/libvpx/vp8/vp8cx_arm.mk | 4
-rw-r--r--  source/libvpx/vp9/common/vp9_alloccommon.c | 9
-rw-r--r--  source/libvpx/vp9/common/vp9_common.h | 5
-rw-r--r--  source/libvpx/vp9/common/vp9_enums.h | 6
-rw-r--r--  source/libvpx/vp9/common/vp9_mv.h | 8
-rw-r--r--  source/libvpx/vp9/common/vp9_onyxc_int.h | 8
-rw-r--r--  source/libvpx/vp9/common/vp9_postproc.c | 3
-rw-r--r--  source/libvpx/vp9/common/vp9_reconintra.c | 58
-rw-r--r--  source/libvpx/vp9/common/vp9_reconintra.h | 2
-rw-r--r--  source/libvpx/vp9/common/vp9_rtcd_defs.pl | 47
-rw-r--r--  source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm | 533
-rw-r--r--  source/libvpx/vp9/decoder/vp9_decodeframe.c | 52
-rw-r--r--  source/libvpx/vp9/decoder/vp9_decoder.c | 12
-rw-r--r--  source/libvpx/vp9/decoder/vp9_dthread.c | 46
-rw-r--r--  source/libvpx/vp9/decoder/vp9_dthread.h | 3
-rw-r--r--  source/libvpx/vp9/encoder/vp9_bitstream.c | 24
-rw-r--r--  source/libvpx/vp9/encoder/vp9_bitstream.h | 2
-rw-r--r--  source/libvpx/vp9/encoder/vp9_block.h | 8
-rw-r--r--  source/libvpx/vp9/encoder/vp9_context_tree.h | 5
-rw-r--r--  source/libvpx/vp9/encoder/vp9_denoiser.c | 62
-rw-r--r--  source/libvpx/vp9/encoder/vp9_denoiser.h | 10
-rw-r--r--  source/libvpx/vp9/encoder/vp9_encodeframe.c | 54
-rw-r--r--  source/libvpx/vp9/encoder/vp9_encodemb.c | 30
-rw-r--r--  source/libvpx/vp9/encoder/vp9_encodemv.c | 2
-rw-r--r--  source/libvpx/vp9/encoder/vp9_encoder.c | 433
-rw-r--r--  source/libvpx/vp9/encoder/vp9_encoder.h | 43
-rw-r--r--  source/libvpx/vp9/encoder/vp9_firstpass.c | 196
-rw-r--r--  source/libvpx/vp9/encoder/vp9_firstpass.h | 2
-rw-r--r--  source/libvpx/vp9/encoder/vp9_lookahead.c | 6
-rw-r--r--  source/libvpx/vp9/encoder/vp9_lookahead.h | 3
-rw-r--r--  source/libvpx/vp9/encoder/vp9_mbgraph.c | 12
-rw-r--r--  source/libvpx/vp9/encoder/vp9_mcomp.c | 283
-rw-r--r--  source/libvpx/vp9/encoder/vp9_mcomp.h | 5
-rw-r--r--  source/libvpx/vp9/encoder/vp9_pickmode.c | 16
-rw-r--r--  source/libvpx/vp9/encoder/vp9_ratectrl.c | 4
-rw-r--r--  source/libvpx/vp9/encoder/vp9_rd.c | 25
-rw-r--r--  source/libvpx/vp9/encoder/vp9_rdopt.c | 294
-rw-r--r--  source/libvpx/vp9/encoder/vp9_speed_features.c | 71
-rw-r--r--  source/libvpx/vp9/encoder/vp9_speed_features.h | 20
-rw-r--r--  source/libvpx/vp9/encoder/vp9_ssim.c | 2
-rw-r--r--  source/libvpx/vp9/encoder/vp9_ssim.h | 2
-rw-r--r--  source/libvpx/vp9/encoder/vp9_svc_layercontext.c | 80
-rw-r--r--  source/libvpx/vp9/encoder/vp9_svc_layercontext.h | 6
-rw-r--r--  source/libvpx/vp9/encoder/vp9_temporal_filter.c | 44
-rw-r--r--  source/libvpx/vp9/encoder/vp9_variance.c | 3
-rw-r--r--  source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c | 71
-rw-r--r--  source/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm | 427
-rw-r--r--  source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c | 295
-rw-r--r--  source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm | 510
-rw-r--r--  source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm | 401
-rw-r--r--  source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c | 103
-rw-r--r--  source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c | 188
-rw-r--r--  source/libvpx/vp9/vp9_common.mk | 1
-rw-r--r--  source/libvpx/vp9/vp9_cx_iface.c | 250
-rw-r--r--  source/libvpx/vp9/vp9_dx_iface.c | 59
-rw-r--r--  source/libvpx/vp9/vp9cx.mk | 4
-rw-r--r--  source/libvpx/vpx/internal/vpx_codec_internal.h | 6
-rw-r--r--  source/libvpx/vpx/src/svc_encodeframe.c | 38
-rw-r--r--  source/libvpx/vpx/src/vpx_codec.c | 5
-rw-r--r--  source/libvpx/vpx/src/vpx_decoder.c | 16
-rw-r--r--  source/libvpx/vpx/src/vpx_encoder.c | 18
-rw-r--r--  source/libvpx/vpx/src/vpx_image.c | 2
-rw-r--r--  source/libvpx/vpx/svc_context.h | 3
-rw-r--r--  source/libvpx/vpx/vp8dx.h | 4
-rw-r--r--  source/libvpx/vpx/vpx_codec.h | 14
-rw-r--r--  source/libvpx/vpx/vpx_decoder.h | 2
-rw-r--r--  source/libvpx/vpx/vpx_encoder.h | 31
-rw-r--r--  source/libvpx/vpx/vpx_image.h | 14
-rw-r--r--  source/libvpx/vpx_mem/vpx_mem.c | 24
-rw-r--r--  source/libvpx/vpx_mem/vpx_mem.h | 3
-rw-r--r--  source/libvpx/vpx_scale/generic/yv12config.c | 54
-rw-r--r--  source/libvpx/vpx_scale/generic/yv12extend.c | 141
-rw-r--r--  source/libvpx/vpx_scale/yv12config.h | 8
-rw-r--r--  source/libvpx/vpxdec.c | 261
-rw-r--r--  source/libvpx/vpxenc.c | 485
-rw-r--r--  source/libvpx/y4minput.c | 6
230 files changed, 8210 insertions(+), 8057 deletions(-)
diff --git a/README.chromium b/README.chromium
index 749aa13..d1b5262 100644
--- a/README.chromium
+++ b/README.chromium
@@ -5,9 +5,9 @@ License: BSD
License File: source/libvpx/LICENSE
Security Critical: yes
-Date: Thursday August 21 2014
+Date: Monday September 08 2014
Branch: master
-Commit: 23c88870ec514b0dd7d22b9db99ae63f46c7d87f
+Commit: c731d6a4f19eea861ceb2ff31399420b2452eb74
Description:
Contains the sources used to compile libvpx binaries used by Google Chrome and
diff --git a/libvpx_srcs.gni b/libvpx_srcs.gni
index 30c227d..39e8a68 100644
--- a/libvpx_srcs.gni
+++ b/libvpx_srcs.gni
@@ -347,7 +347,6 @@ libvpx_srcs_x86_assembly = [
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm",
- "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse4.asm",
@@ -355,8 +354,6 @@ libvpx_srcs_x86_assembly = [
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm",
- "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm",
- "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm",
"//third_party/libvpx/source/libvpx/vpx_ports/emms.asm",
"//third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm",
]
@@ -364,7 +361,6 @@ libvpx_srcs_x86_mmx = [
"//third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c",
"//third_party/libvpx/source/libvpx/vp8/common/x86/variance_mmx.c",
"//third_party/libvpx/source/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c",
- "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c",
]
libvpx_srcs_x86_sse2 = [
"//third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_sse2.c",
@@ -743,7 +739,6 @@ libvpx_srcs_x86_64_assembly = [
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm",
- "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse4.asm",
@@ -752,8 +747,6 @@ libvpx_srcs_x86_64_assembly = [
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm",
- "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm",
- "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm",
"//third_party/libvpx/source/libvpx/vpx_ports/emms.asm",
"//third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm",
]
@@ -761,7 +754,6 @@ libvpx_srcs_x86_64_mmx = [
"//third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c",
"//third_party/libvpx/source/libvpx/vp8/common/x86/variance_mmx.c",
"//third_party/libvpx/source/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c",
- "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c",
]
libvpx_srcs_x86_64_sse2 = [
"//third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_sse2.c",
@@ -1157,20 +1149,19 @@ libvpx_srcs_arm_neon = [
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_blk_neon.c",
- "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm",
- "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.c",
- "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c",
- "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/reconintra_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c",
- "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm",
- "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm",
- "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c",
"//third_party/libvpx/source/libvpx/vp8/common/blockd.c",
"//third_party/libvpx/source/libvpx/vp8/common/blockd.h",
@@ -1250,10 +1241,8 @@ libvpx_srcs_arm_neon = [
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/dct_arm.c",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/denoising_neon.c",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm",
- "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/picklpf_arm.c",
- "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm",
+ "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/subtract_neon.c",
- "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/quantize_arm.c",
@@ -1538,13 +1527,6 @@ libvpx_srcs_arm_neon_cpu_detect = [
"//third_party/libvpx/source/libvpx/vp8/common/arm/dequantize_arm.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/filter_arm.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/loopfilter_arm.c",
- "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm",
- "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm",
- "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm",
- "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm",
- "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm",
- "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm",
- "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm",
"//third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c",
"//third_party/libvpx/source/libvpx/vp8/common/blockd.c",
"//third_party/libvpx/source/libvpx/vp8/common/blockd.h",
@@ -1623,9 +1605,6 @@ libvpx_srcs_arm_neon_cpu_detect = [
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/dct_arm.c",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm",
- "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/picklpf_arm.c",
- "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm",
- "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/quantize_arm.c",
"//third_party/libvpx/source/libvpx/vp8/encoder/bitstream.c",
@@ -1879,14 +1858,21 @@ libvpx_srcs_arm_neon_cpu_detect_neon = [
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_blk_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/reconintra_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/denoising_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/subtract_neon.c",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c",
"//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c",
@@ -1909,13 +1895,20 @@ libvpx_srcs_arm64 = [
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_blk_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/reconintra_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c",
"//third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c",
"//third_party/libvpx/source/libvpx/vp8/common/blockd.c",
"//third_party/libvpx/source/libvpx/vp8/common/blockd.h",
@@ -1989,6 +1982,7 @@ libvpx_srcs_arm64 = [
"//third_party/libvpx/source/libvpx/vp8/decoder/treereader.h",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/dct_arm.c",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/denoising_neon.c",
+ "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/subtract_neon.c",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c",
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/quantize_arm.c",
diff --git a/libvpx_srcs_arm64.gypi b/libvpx_srcs_arm64.gypi
index 545ff22..a6c51b1 100644
--- a/libvpx_srcs_arm64.gypi
+++ b/libvpx_srcs_arm64.gypi
@@ -15,13 +15,20 @@
'<(libvpx_source)/vp8/common/arm/neon/dc_only_idct_add_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/dequant_idct_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/dequantizeb_neon.c',
+ '<(libvpx_source)/vp8/common/arm/neon/idct_blk_neon.c',
+ '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_0_2x_neon.c',
+ '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_full_2x_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/iwalsh_neon.c',
+ '<(libvpx_source)/vp8/common/arm/neon/loopfilter_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c',
+ '<(libvpx_source)/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/mbloopfilter_neon.c',
+ '<(libvpx_source)/vp8/common/arm/neon/reconintra_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/sad_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/shortidct4x4llm_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/sixtappredict_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/variance_neon.c',
+ '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance_neon.c',
'<(libvpx_source)/vp8/common/arm/variance_arm.c',
'<(libvpx_source)/vp8/common/blockd.c',
'<(libvpx_source)/vp8/common/blockd.h',
@@ -95,6 +102,7 @@
'<(libvpx_source)/vp8/decoder/treereader.h',
'<(libvpx_source)/vp8/encoder/arm/dct_arm.c',
'<(libvpx_source)/vp8/encoder/arm/neon/denoising_neon.c',
+ '<(libvpx_source)/vp8/encoder/arm/neon/shortfdct_neon.c',
'<(libvpx_source)/vp8/encoder/arm/neon/subtract_neon.c',
'<(libvpx_source)/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c',
'<(libvpx_source)/vp8/encoder/arm/quantize_arm.c',
diff --git a/libvpx_srcs_arm_neon.gypi b/libvpx_srcs_arm_neon.gypi
index 80973bb..2ce983a 100644
--- a/libvpx_srcs_arm_neon.gypi
+++ b/libvpx_srcs_arm_neon.gypi
@@ -39,20 +39,19 @@
'<(libvpx_source)/vp8/common/arm/neon/dequant_idct_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/dequantizeb_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/idct_blk_neon.c',
- '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm',
- '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm',
+ '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_0_2x_neon.c',
+ '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_full_2x_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/iwalsh_neon.c',
- '<(libvpx_source)/vp8/common/arm/neon/loopfilter_neon.asm',
+ '<(libvpx_source)/vp8/common/arm/neon/loopfilter_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c',
- '<(libvpx_source)/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm',
+ '<(libvpx_source)/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/mbloopfilter_neon.c',
+ '<(libvpx_source)/vp8/common/arm/neon/reconintra_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/sad_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/shortidct4x4llm_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/sixtappredict_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/variance_neon.c',
- '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm',
- '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm',
- '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm',
+ '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance_neon.c',
'<(libvpx_source)/vp8/common/arm/variance_arm.c',
'<(libvpx_source)/vp8/common/blockd.c',
'<(libvpx_source)/vp8/common/blockd.h',
@@ -132,10 +131,8 @@
'<(libvpx_source)/vp8/encoder/arm/dct_arm.c',
'<(libvpx_source)/vp8/encoder/arm/neon/denoising_neon.c',
'<(libvpx_source)/vp8/encoder/arm/neon/fastquantizeb_neon.asm',
- '<(libvpx_source)/vp8/encoder/arm/neon/picklpf_arm.c',
- '<(libvpx_source)/vp8/encoder/arm/neon/shortfdct_neon.asm',
+ '<(libvpx_source)/vp8/encoder/arm/neon/shortfdct_neon.c',
'<(libvpx_source)/vp8/encoder/arm/neon/subtract_neon.c',
- '<(libvpx_source)/vp8/encoder/arm/neon/vp8_memcpy_neon.asm',
'<(libvpx_source)/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm',
'<(libvpx_source)/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c',
'<(libvpx_source)/vp8/encoder/arm/quantize_arm.c',
diff --git a/libvpx_srcs_arm_neon_cpu_detect.gypi b/libvpx_srcs_arm_neon_cpu_detect.gypi
index 796f50b..2af52a7 100644
--- a/libvpx_srcs_arm_neon_cpu_detect.gypi
+++ b/libvpx_srcs_arm_neon_cpu_detect.gypi
@@ -33,13 +33,6 @@
'<(libvpx_source)/vp8/common/arm/dequantize_arm.c',
'<(libvpx_source)/vp8/common/arm/filter_arm.c',
'<(libvpx_source)/vp8/common/arm/loopfilter_arm.c',
- '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm',
- '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm',
- '<(libvpx_source)/vp8/common/arm/neon/loopfilter_neon.asm',
- '<(libvpx_source)/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm',
- '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm',
- '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm',
- '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm',
'<(libvpx_source)/vp8/common/arm/variance_arm.c',
'<(libvpx_source)/vp8/common/blockd.c',
'<(libvpx_source)/vp8/common/blockd.h',
@@ -118,9 +111,6 @@
'<(libvpx_source)/vp8/encoder/arm/armv6/walsh_v6.asm',
'<(libvpx_source)/vp8/encoder/arm/dct_arm.c',
'<(libvpx_source)/vp8/encoder/arm/neon/fastquantizeb_neon.asm',
- '<(libvpx_source)/vp8/encoder/arm/neon/picklpf_arm.c',
- '<(libvpx_source)/vp8/encoder/arm/neon/shortfdct_neon.asm',
- '<(libvpx_source)/vp8/encoder/arm/neon/vp8_memcpy_neon.asm',
'<(libvpx_source)/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm',
'<(libvpx_source)/vp8/encoder/arm/quantize_arm.c',
'<(libvpx_source)/vp8/encoder/bitstream.c',
diff --git a/libvpx_srcs_arm_neon_cpu_detect_intrinsics.gypi b/libvpx_srcs_arm_neon_cpu_detect_intrinsics.gypi
index 2fa1ba6..07eab36 100644
--- a/libvpx_srcs_arm_neon_cpu_detect_intrinsics.gypi
+++ b/libvpx_srcs_arm_neon_cpu_detect_intrinsics.gypi
@@ -19,14 +19,21 @@
'<(libvpx_source)/vp8/common/arm/neon/dequant_idct_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/dequantizeb_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/idct_blk_neon.c',
+ '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_0_2x_neon.c',
+ '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_full_2x_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/iwalsh_neon.c',
+ '<(libvpx_source)/vp8/common/arm/neon/loopfilter_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c',
+ '<(libvpx_source)/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/mbloopfilter_neon.c',
+ '<(libvpx_source)/vp8/common/arm/neon/reconintra_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/sad_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/shortidct4x4llm_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/sixtappredict_neon.c',
'<(libvpx_source)/vp8/common/arm/neon/variance_neon.c',
+ '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance_neon.c',
'<(libvpx_source)/vp8/encoder/arm/neon/denoising_neon.c',
+ '<(libvpx_source)/vp8/encoder/arm/neon/shortfdct_neon.c',
'<(libvpx_source)/vp8/encoder/arm/neon/subtract_neon.c',
'<(libvpx_source)/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c',
'<(libvpx_source)/vp9/common/arm/neon/vp9_convolve_neon.c',
diff --git a/libvpx_srcs_x86.gypi b/libvpx_srcs_x86.gypi
index 91791a6..f5b3995 100644
--- a/libvpx_srcs_x86.gypi
+++ b/libvpx_srcs_x86.gypi
@@ -309,7 +309,6 @@
'<(libvpx_source)/vp9/encoder/x86/vp9_dct_mmx.asm',
'<(libvpx_source)/vp9/encoder/x86/vp9_error_sse2.asm',
'<(libvpx_source)/vp9/encoder/x86/vp9_sad4d_sse2.asm',
- '<(libvpx_source)/vp9/encoder/x86/vp9_sad_mmx.asm',
'<(libvpx_source)/vp9/encoder/x86/vp9_sad_sse2.asm',
'<(libvpx_source)/vp9/encoder/x86/vp9_sad_sse3.asm',
'<(libvpx_source)/vp9/encoder/x86/vp9_sad_sse4.asm',
@@ -317,8 +316,6 @@
'<(libvpx_source)/vp9/encoder/x86/vp9_subpel_variance.asm',
'<(libvpx_source)/vp9/encoder/x86/vp9_subtract_sse2.asm',
'<(libvpx_source)/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm',
- '<(libvpx_source)/vp9/encoder/x86/vp9_variance_impl_mmx.asm',
- '<(libvpx_source)/vp9/encoder/x86/vp9_variance_impl_sse2.asm',
'<(libvpx_source)/vp9/vp9_cx_iface.c',
'<(libvpx_source)/vp9/vp9_dx_iface.c',
'<(libvpx_source)/vp9/vp9_iface_common.h',
diff --git a/libvpx_srcs_x86_64.gypi b/libvpx_srcs_x86_64.gypi
index 07c46a7..f29c6c8 100644
--- a/libvpx_srcs_x86_64.gypi
+++ b/libvpx_srcs_x86_64.gypi
@@ -314,7 +314,6 @@
'<(libvpx_source)/vp9/encoder/x86/vp9_error_sse2.asm',
'<(libvpx_source)/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm',
'<(libvpx_source)/vp9/encoder/x86/vp9_sad4d_sse2.asm',
- '<(libvpx_source)/vp9/encoder/x86/vp9_sad_mmx.asm',
'<(libvpx_source)/vp9/encoder/x86/vp9_sad_sse2.asm',
'<(libvpx_source)/vp9/encoder/x86/vp9_sad_sse3.asm',
'<(libvpx_source)/vp9/encoder/x86/vp9_sad_sse4.asm',
@@ -323,8 +322,6 @@
'<(libvpx_source)/vp9/encoder/x86/vp9_subpel_variance.asm',
'<(libvpx_source)/vp9/encoder/x86/vp9_subtract_sse2.asm',
'<(libvpx_source)/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm',
- '<(libvpx_source)/vp9/encoder/x86/vp9_variance_impl_mmx.asm',
- '<(libvpx_source)/vp9/encoder/x86/vp9_variance_impl_sse2.asm',
'<(libvpx_source)/vp9/vp9_cx_iface.c',
'<(libvpx_source)/vp9/vp9_dx_iface.c',
'<(libvpx_source)/vp9/vp9_iface_common.h',
diff --git a/libvpx_srcs_x86_64_intrinsics.gypi b/libvpx_srcs_x86_64_intrinsics.gypi
index a47c1e5..bb1b203 100644
--- a/libvpx_srcs_x86_64_intrinsics.gypi
+++ b/libvpx_srcs_x86_64_intrinsics.gypi
@@ -16,7 +16,6 @@
'<(libvpx_source)/vp8/common/x86/idct_blk_mmx.c',
'<(libvpx_source)/vp8/common/x86/variance_mmx.c',
'<(libvpx_source)/vp8/encoder/x86/vp8_enc_stubs_mmx.c',
- '<(libvpx_source)/vp9/encoder/x86/vp9_variance_mmx.c',
],
'cflags': [ '-mmmx', ],
'xcode_settings': { 'OTHER_CFLAGS': [ '-mmmx' ] },
diff --git a/libvpx_srcs_x86_intrinsics.gypi b/libvpx_srcs_x86_intrinsics.gypi
index a47c1e5..bb1b203 100644
--- a/libvpx_srcs_x86_intrinsics.gypi
+++ b/libvpx_srcs_x86_intrinsics.gypi
@@ -16,7 +16,6 @@
'<(libvpx_source)/vp8/common/x86/idct_blk_mmx.c',
'<(libvpx_source)/vp8/common/x86/variance_mmx.c',
'<(libvpx_source)/vp8/encoder/x86/vp8_enc_stubs_mmx.c',
- '<(libvpx_source)/vp9/encoder/x86/vp9_variance_mmx.c',
],
'cflags': [ '-mmmx', ],
'xcode_settings': { 'OTHER_CFLAGS': [ '-mmmx' ] },
diff --git a/source/config/linux/arm-neon-cpu-detect/vp8_rtcd.h b/source/config/linux/arm-neon-cpu-detect/vp8_rtcd.h
index 67936cc..9e41308 100644
--- a/source/config/linux/arm-neon-cpu-detect/vp8_rtcd.h
+++ b/source/config/linux/arm-neon-cpu-detect/vp8_rtcd.h
@@ -59,10 +59,12 @@ int vp8_block_error_c(short *coeff, short *dqcoeff);
#define vp8_block_error vp8_block_error_c
void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
+void vp8_build_intra_predictors_mbuv_s_neon(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
+void vp8_build_intra_predictors_mby_s_neon(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
void vp8_clear_system_state_c();
#define vp8_clear_system_state vp8_clear_system_state_c
@@ -420,10 +422,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_armv6(const unsigned char *src_ptr,
unsigned int vp8_variance_halfpixvar16x16_v_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-void vp8_yv12_copy_partial_frame_neon(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-RTCD_EXTERN void (*vp8_yv12_copy_partial_frame)(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-
void vp8_rtcd(void);
#include "vpx_config.h"
@@ -444,6 +442,10 @@ static void setup_rtcd_internal(void)
if (flags & HAS_NEON) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_neon;
vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_armv6;
if (flags & HAS_NEON) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_neon;
+ vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_c;
+ if (flags & HAS_NEON) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_neon;
+ vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_c;
+ if (flags & HAS_NEON) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_neon;
vp8_copy_mem16x16 = vp8_copy_mem16x16_v6;
if (flags & HAS_NEON) vp8_copy_mem16x16 = vp8_copy_mem16x16_neon;
vp8_copy_mem8x4 = vp8_copy_mem8x4_v6;
@@ -544,8 +546,6 @@ static void setup_rtcd_internal(void)
if (flags & HAS_NEON) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_neon;
vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_armv6;
if (flags & HAS_NEON) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_neon;
- vp8_yv12_copy_partial_frame = vp8_yv12_copy_partial_frame_c;
- if (flags & HAS_NEON) vp8_yv12_copy_partial_frame = vp8_yv12_copy_partial_frame_neon;
}
#endif
diff --git a/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h b/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
index fac25a0..c4da123 100644
--- a/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
+++ b/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
@@ -28,15 +28,6 @@ struct mv;
union int_mv;
struct yv12_buffer_config;
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
diff --git a/source/config/linux/arm-neon-cpu-detect/vpx_config.asm b/source/config/linux/arm-neon-cpu-detect/vpx_config.asm
index f8b3a15..5e15c83 100644
--- a/source/config/linux/arm-neon-cpu-detect/vpx_config.asm
+++ b/source/config/linux/arm-neon-cpu-detect/vpx_config.asm
@@ -82,6 +82,7 @@
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 0
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_SIZE_LIMIT , 0
.equ CONFIG_SPATIAL_SVC , 0
diff --git a/source/config/linux/arm-neon-cpu-detect/vpx_config.h b/source/config/linux/arm-neon-cpu-detect/vpx_config.h
index 9cfd076..0bb6cee 100644
--- a/source/config/linux/arm-neon-cpu-detect/vpx_config.h
+++ b/source/config/linux/arm-neon-cpu-detect/vpx_config.h
@@ -91,6 +91,7 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 0
#define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/linux/arm-neon/vp8_rtcd.h b/source/config/linux/arm-neon/vp8_rtcd.h
index 00f2e61..703294a 100644
--- a/source/config/linux/arm-neon/vp8_rtcd.h
+++ b/source/config/linux/arm-neon/vp8_rtcd.h
@@ -59,10 +59,12 @@ int vp8_block_error_c(short *coeff, short *dqcoeff);
#define vp8_block_error vp8_block_error_c
void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
+void vp8_build_intra_predictors_mbuv_s_neon(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_neon
void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
+void vp8_build_intra_predictors_mby_s_neon(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon
void vp8_clear_system_state_c();
#define vp8_clear_system_state vp8_clear_system_state_c
@@ -420,10 +422,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_armv6(const unsigned char *src_ptr,
unsigned int vp8_variance_halfpixvar16x16_v_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_neon
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-void vp8_yv12_copy_partial_frame_neon(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_neon
-
void vp8_rtcd(void);
#include "vpx_config.h"
diff --git a/source/config/linux/arm-neon/vp9_rtcd.h b/source/config/linux/arm-neon/vp9_rtcd.h
index ff6a27e..cd2cc54 100644
--- a/source/config/linux/arm-neon/vp9_rtcd.h
+++ b/source/config/linux/arm-neon/vp9_rtcd.h
@@ -28,15 +28,6 @@ struct mv;
union int_mv;
struct yv12_buffer_config;
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
diff --git a/source/config/linux/arm-neon/vpx_config.asm b/source/config/linux/arm-neon/vpx_config.asm
index a9eab27..fbd36f1 100644
--- a/source/config/linux/arm-neon/vpx_config.asm
+++ b/source/config/linux/arm-neon/vpx_config.asm
@@ -82,6 +82,7 @@
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 0
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_SIZE_LIMIT , 0
.equ CONFIG_SPATIAL_SVC , 0
diff --git a/source/config/linux/arm-neon/vpx_config.h b/source/config/linux/arm-neon/vpx_config.h
index c497ddb..b858039 100644
--- a/source/config/linux/arm-neon/vpx_config.h
+++ b/source/config/linux/arm-neon/vpx_config.h
@@ -91,6 +91,7 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 0
#define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/linux/arm/vp8_rtcd.h b/source/config/linux/arm/vp8_rtcd.h
index ec35c11..780d938 100644
--- a/source/config/linux/arm/vp8_rtcd.h
+++ b/source/config/linux/arm/vp8_rtcd.h
@@ -366,9 +366,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int
unsigned int vp8_variance_halfpixvar16x16_v_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_armv6
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
void vp8_rtcd(void);
#include "vpx_config.h"
diff --git a/source/config/linux/arm/vp9_rtcd.h b/source/config/linux/arm/vp9_rtcd.h
index 0ebc52b..2be563e 100644
--- a/source/config/linux/arm/vp9_rtcd.h
+++ b/source/config/linux/arm/vp9_rtcd.h
@@ -28,15 +28,6 @@ struct mv;
union int_mv;
struct yv12_buffer_config;
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
diff --git a/source/config/linux/arm/vpx_config.asm b/source/config/linux/arm/vpx_config.asm
index dd8be51..d8c8989 100644
--- a/source/config/linux/arm/vpx_config.asm
+++ b/source/config/linux/arm/vpx_config.asm
@@ -82,6 +82,7 @@
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 0
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_SIZE_LIMIT , 0
.equ CONFIG_SPATIAL_SVC , 0
diff --git a/source/config/linux/arm/vpx_config.h b/source/config/linux/arm/vpx_config.h
index ee5f10d..5967658 100644
--- a/source/config/linux/arm/vpx_config.h
+++ b/source/config/linux/arm/vpx_config.h
@@ -91,6 +91,7 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 0
#define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/linux/arm64/vp8_rtcd.h b/source/config/linux/arm64/vp8_rtcd.h
index 9d45b89..f1b86d0 100644
--- a/source/config/linux/arm64/vp8_rtcd.h
+++ b/source/config/linux/arm64/vp8_rtcd.h
@@ -55,10 +55,12 @@ int vp8_block_error_c(short *coeff, short *dqcoeff);
#define vp8_block_error vp8_block_error_c
void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
+void vp8_build_intra_predictors_mbuv_s_neon(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_neon
void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
+void vp8_build_intra_predictors_mby_s_neon(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon
void vp8_clear_system_state_c();
#define vp8_clear_system_state vp8_clear_system_state_c
@@ -92,10 +94,12 @@ void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *output, i
#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c
+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
void vp8_dequantize_b_c(struct blockd*, short *dqc);
void vp8_dequantize_b_neon(struct blockd*, short *dqc);
@@ -132,10 +136,12 @@ void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left
#define vp8_intra4x4_predict vp8_intra4x4_predict_c
void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bh vp8_loop_filter_bh_c
+void vp8_loop_filter_bh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_neon
void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bv vp8_loop_filter_bv_c
+void vp8_loop_filter_bv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_neon
void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
void vp8_loop_filter_mbh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
@@ -150,14 +156,16 @@ void vp8_loop_filter_bhs_neon(unsigned char *y, int ystride, const unsigned char
#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_neon
void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c
+void vp8_loop_filter_bvs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_neon
void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
void vp8_loop_filter_mbhs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
#define vp8_loop_filter_simple_mbh vp8_loop_filter_mbhs_neon
void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c
+void vp8_loop_filter_mbvs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_mbvs_neon
int vp8_mbblock_error_c(struct macroblock *mb, int dc);
#define vp8_mbblock_error vp8_mbblock_error_c
@@ -267,10 +275,12 @@ void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned
#define vp8_sad8x8x8 vp8_sad8x8x8_c
void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
-#define vp8_short_fdct4x4 vp8_short_fdct4x4_c
+void vp8_short_fdct4x4_neon(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_neon
void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
-#define vp8_short_fdct8x4 vp8_short_fdct8x4_c
+void vp8_short_fdct8x4_neon(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_neon
void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
@@ -360,9 +370,6 @@ unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int
unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
void vp8_rtcd(void);
#include "vpx_config.h"
diff --git a/source/config/linux/arm64/vp9_rtcd.h b/source/config/linux/arm64/vp9_rtcd.h
index 582837a..176e7af 100644
--- a/source/config/linux/arm64/vp9_rtcd.h
+++ b/source/config/linux/arm64/vp9_rtcd.h
@@ -28,15 +28,6 @@ struct mv;
union int_mv;
struct yv12_buffer_config;
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
diff --git a/source/config/linux/arm64/vpx_config.asm b/source/config/linux/arm64/vpx_config.asm
index bb141a1..a03bced 100644
--- a/source/config/linux/arm64/vpx_config.asm
+++ b/source/config/linux/arm64/vpx_config.asm
@@ -82,6 +82,7 @@
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 0
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_SIZE_LIMIT , 0
.equ CONFIG_SPATIAL_SVC , 0
diff --git a/source/config/linux/arm64/vpx_config.h b/source/config/linux/arm64/vpx_config.h
index e791223..06f3045 100644
--- a/source/config/linux/arm64/vpx_config.h
+++ b/source/config/linux/arm64/vpx_config.h
@@ -91,6 +91,7 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 0
#define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/linux/generic/vp8_rtcd.h b/source/config/linux/generic/vp8_rtcd.h
index 298886d..79edff7 100644
--- a/source/config/linux/generic/vp8_rtcd.h
+++ b/source/config/linux/generic/vp8_rtcd.h
@@ -323,9 +323,6 @@ unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int
unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
void vp8_rtcd(void);
#include "vpx_config.h"
diff --git a/source/config/linux/generic/vp9_rtcd.h b/source/config/linux/generic/vp9_rtcd.h
index c2df3fb..5c9b779 100644
--- a/source/config/linux/generic/vp9_rtcd.h
+++ b/source/config/linux/generic/vp9_rtcd.h
@@ -28,15 +28,6 @@ struct mv;
union int_mv;
struct yv12_buffer_config;
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
diff --git a/source/config/linux/generic/vpx_config.asm b/source/config/linux/generic/vpx_config.asm
index 42f23e4..b2fa7be 100644
--- a/source/config/linux/generic/vpx_config.asm
+++ b/source/config/linux/generic/vpx_config.asm
@@ -82,6 +82,7 @@
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 0
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_SIZE_LIMIT , 0
.equ CONFIG_SPATIAL_SVC , 0
diff --git a/source/config/linux/generic/vpx_config.h b/source/config/linux/generic/vpx_config.h
index 75d1415..a16afde 100644
--- a/source/config/linux/generic/vpx_config.h
+++ b/source/config/linux/generic/vpx_config.h
@@ -91,6 +91,7 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 0
#define CONFIG_SPATIAL_SVC 0
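
The new CONFIG_VP9_HIGHBITDEPTH symbol is 0 on every platform in this roll, so the high-bit-depth paths stay compiled out. A rough sketch of how such a config macro gates code at preprocessing time (the sample_t typedef is illustrative, not from libvpx):

#include <stdint.h>
#include <stdio.h>

#define CONFIG_VP9_HIGHBITDEPTH 0  /* as set in the configs above */

#if CONFIG_VP9_HIGHBITDEPTH
typedef uint16_t sample_t;  /* 16-bit samples would be enabled by the flag */
#else
typedef uint8_t sample_t;   /* 8-bit path, the only one compiled here */
#endif

int main(void) {
    printf("sample size: %zu byte(s)\n", sizeof(sample_t));
    return 0;
}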
diff --git a/source/config/linux/ia32/vp8_rtcd.h b/source/config/linux/ia32/vp8_rtcd.h
index 4dc2d75..fd88326 100644
--- a/source/config/linux/ia32/vp8_rtcd.h
+++ b/source/config/linux/ia32/vp8_rtcd.h
@@ -480,9 +480,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, in
unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
void vp8_rtcd(void);
#ifdef RTCD_C
diff --git a/source/config/linux/ia32/vp9_rtcd.h b/source/config/linux/ia32/vp9_rtcd.h
index 5d4bb2f..aa34a25 100644
--- a/source/config/linux/ia32/vp9_rtcd.h
+++ b/source/config/linux/ia32/vp9_rtcd.h
@@ -28,15 +28,6 @@ struct mv;
union int_mv;
struct yv12_buffer_config;
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
RTCD_EXTERN int64_t (*vp9_block_error)(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
@@ -286,12 +277,10 @@ void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8
RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
RTCD_EXTERN void (*vp9_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
unsigned int vp9_get_mb_ss_c(const int16_t *);
-unsigned int vp9_get_mb_ss_mmx(const int16_t *);
unsigned int vp9_get_mb_ss_sse2(const int16_t *);
RTCD_EXTERN unsigned int (*vp9_get_mb_ss)(const int16_t *);
@@ -420,18 +409,20 @@ void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
RTCD_EXTERN void (*vp9_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
+unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
+unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
+unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_b vp9_quantize_b_c
@@ -449,7 +440,6 @@ int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int
#define vp9_refining_search_sad vp9_refining_search_sad_c
unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
RTCD_EXTERN unsigned int (*vp9_sad16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -482,7 +472,6 @@ void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t
RTCD_EXTERN void (*vp9_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
RTCD_EXTERN unsigned int (*vp9_sad16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -545,7 +534,6 @@ void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t
RTCD_EXTERN void (*vp9_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
RTCD_EXTERN unsigned int (*vp9_sad4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -610,7 +598,6 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *re
#define vp9_sad64x64x8 vp9_sad64x64x8_c
unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
RTCD_EXTERN unsigned int (*vp9_sad8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -645,7 +632,6 @@ void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p
#define vp9_sad8x4x8 vp9_sad8x4x8_c
unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
RTCD_EXTERN unsigned int (*vp9_sad8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -834,7 +820,6 @@ void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
RTCD_EXTERN void (*vp9_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -843,7 +828,6 @@ unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, c
RTCD_EXTERN unsigned int (*vp9_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp9_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -860,7 +844,6 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, c
RTCD_EXTERN unsigned int (*vp9_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp9_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -877,7 +860,6 @@ unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, c
RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp9_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -886,7 +868,6 @@ unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, con
RTCD_EXTERN unsigned int (*vp9_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -994,10 +975,8 @@ static void setup_rtcd_internal(void)
vp9_get16x16var = vp9_get16x16var_c;
if (flags & HAS_SSE2) vp9_get16x16var = vp9_get16x16var_sse2;
vp9_get8x8var = vp9_get8x8var_c;
- if (flags & HAS_MMX) vp9_get8x8var = vp9_get8x8var_mmx;
if (flags & HAS_SSE2) vp9_get8x8var = vp9_get8x8var_sse2;
vp9_get_mb_ss = vp9_get_mb_ss_c;
- if (flags & HAS_MMX) vp9_get_mb_ss = vp9_get_mb_ss_mmx;
if (flags & HAS_SSE2) vp9_get_mb_ss = vp9_get_mb_ss_sse2;
vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c;
if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3;
@@ -1060,10 +1039,14 @@ static void setup_rtcd_internal(void)
vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_c;
if (flags & HAS_SSE2) vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_sse2;
vp9_mse16x16 = vp9_mse16x16_c;
- if (flags & HAS_MMX) vp9_mse16x16 = vp9_mse16x16_mmx;
if (flags & HAS_SSE2) vp9_mse16x16 = vp9_mse16x16_sse2;
+ vp9_mse16x8 = vp9_mse16x8_c;
+ if (flags & HAS_SSE2) vp9_mse16x8 = vp9_mse16x8_sse2;
+ vp9_mse8x16 = vp9_mse8x16_c;
+ if (flags & HAS_SSE2) vp9_mse8x16 = vp9_mse8x16_sse2;
+ vp9_mse8x8 = vp9_mse8x8_c;
+ if (flags & HAS_SSE2) vp9_mse8x8 = vp9_mse8x8_sse2;
vp9_sad16x16 = vp9_sad16x16_c;
- if (flags & HAS_MMX) vp9_sad16x16 = vp9_sad16x16_mmx;
if (flags & HAS_SSE2) vp9_sad16x16 = vp9_sad16x16_sse2;
vp9_sad16x16_avg = vp9_sad16x16_avg_c;
if (flags & HAS_SSE2) vp9_sad16x16_avg = vp9_sad16x16_avg_sse2;
@@ -1079,7 +1062,6 @@ static void setup_rtcd_internal(void)
vp9_sad16x32x4d = vp9_sad16x32x4d_c;
if (flags & HAS_SSE2) vp9_sad16x32x4d = vp9_sad16x32x4d_sse2;
vp9_sad16x8 = vp9_sad16x8_c;
- if (flags & HAS_MMX) vp9_sad16x8 = vp9_sad16x8_mmx;
if (flags & HAS_SSE2) vp9_sad16x8 = vp9_sad16x8_sse2;
vp9_sad16x8_avg = vp9_sad16x8_avg_c;
if (flags & HAS_SSE2) vp9_sad16x8_avg = vp9_sad16x8_avg_sse2;
@@ -1107,7 +1089,6 @@ static void setup_rtcd_internal(void)
vp9_sad32x64x4d = vp9_sad32x64x4d_c;
if (flags & HAS_SSE2) vp9_sad32x64x4d = vp9_sad32x64x4d_sse2;
vp9_sad4x4 = vp9_sad4x4_c;
- if (flags & HAS_MMX) vp9_sad4x4 = vp9_sad4x4_mmx;
if (flags & HAS_SSE) vp9_sad4x4 = vp9_sad4x4_sse;
vp9_sad4x4_avg = vp9_sad4x4_avg_c;
if (flags & HAS_SSE) vp9_sad4x4_avg = vp9_sad4x4_avg_sse;
@@ -1134,7 +1115,6 @@ static void setup_rtcd_internal(void)
vp9_sad64x64x4d = vp9_sad64x64x4d_c;
if (flags & HAS_SSE2) vp9_sad64x64x4d = vp9_sad64x64x4d_sse2;
vp9_sad8x16 = vp9_sad8x16_c;
- if (flags & HAS_MMX) vp9_sad8x16 = vp9_sad8x16_mmx;
if (flags & HAS_SSE2) vp9_sad8x16 = vp9_sad8x16_sse2;
vp9_sad8x16_avg = vp9_sad8x16_avg_c;
if (flags & HAS_SSE2) vp9_sad8x16_avg = vp9_sad8x16_avg_sse2;
@@ -1149,7 +1129,6 @@ static void setup_rtcd_internal(void)
vp9_sad8x4x4d = vp9_sad8x4x4d_c;
if (flags & HAS_SSE2) vp9_sad8x4x4d = vp9_sad8x4x4d_sse2;
vp9_sad8x8 = vp9_sad8x8_c;
- if (flags & HAS_MMX) vp9_sad8x8 = vp9_sad8x8_mmx;
if (flags & HAS_SSE2) vp9_sad8x8 = vp9_sad8x8_sse2;
vp9_sad8x8_avg = vp9_sad8x8_avg_c;
if (flags & HAS_SSE2) vp9_sad8x8_avg = vp9_sad8x8_avg_sse2;
@@ -1254,12 +1233,10 @@ static void setup_rtcd_internal(void)
vp9_v_predictor_8x8 = vp9_v_predictor_8x8_c;
if (flags & HAS_SSE) vp9_v_predictor_8x8 = vp9_v_predictor_8x8_sse;
vp9_variance16x16 = vp9_variance16x16_c;
- if (flags & HAS_MMX) vp9_variance16x16 = vp9_variance16x16_mmx;
if (flags & HAS_SSE2) vp9_variance16x16 = vp9_variance16x16_sse2;
vp9_variance16x32 = vp9_variance16x32_c;
if (flags & HAS_SSE2) vp9_variance16x32 = vp9_variance16x32_sse2;
vp9_variance16x8 = vp9_variance16x8_c;
- if (flags & HAS_MMX) vp9_variance16x8 = vp9_variance16x8_mmx;
if (flags & HAS_SSE2) vp9_variance16x8 = vp9_variance16x8_sse2;
vp9_variance32x16 = vp9_variance32x16_c;
if (flags & HAS_SSE2) vp9_variance32x16 = vp9_variance32x16_sse2;
@@ -1268,7 +1245,6 @@ static void setup_rtcd_internal(void)
vp9_variance32x64 = vp9_variance32x64_c;
if (flags & HAS_SSE2) vp9_variance32x64 = vp9_variance32x64_sse2;
vp9_variance4x4 = vp9_variance4x4_c;
- if (flags & HAS_MMX) vp9_variance4x4 = vp9_variance4x4_mmx;
if (flags & HAS_SSE2) vp9_variance4x4 = vp9_variance4x4_sse2;
vp9_variance4x8 = vp9_variance4x8_c;
if (flags & HAS_SSE2) vp9_variance4x8 = vp9_variance4x8_sse2;
@@ -1277,12 +1253,10 @@ static void setup_rtcd_internal(void)
vp9_variance64x64 = vp9_variance64x64_c;
if (flags & HAS_SSE2) vp9_variance64x64 = vp9_variance64x64_sse2;
vp9_variance8x16 = vp9_variance8x16_c;
- if (flags & HAS_MMX) vp9_variance8x16 = vp9_variance8x16_mmx;
if (flags & HAS_SSE2) vp9_variance8x16 = vp9_variance8x16_sse2;
vp9_variance8x4 = vp9_variance8x4_c;
if (flags & HAS_SSE2) vp9_variance8x4 = vp9_variance8x4_sse2;
vp9_variance8x8 = vp9_variance8x8_c;
- if (flags & HAS_MMX) vp9_variance8x8 = vp9_variance8x8_mmx;
if (flags & HAS_SSE2) vp9_variance8x8 = vp9_variance8x8_sse2;
}
#endif
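
On ia32 the same entry points stay behind RTCD_EXTERN function pointers that setup_rtcd_internal() wires up from CPU feature flags, which is why each new vp9_mse variant above gains both an _sse2 prototype and a `flags & HAS_SSE2` assignment. A self-contained sketch of that dispatch pattern (the flag bit and both bodies are stand-ins; libvpx derives the real flags from x86 CPUID detection):

#include <stdint.h>
#include <stdio.h>

#define HAS_SSE2 0x01  /* stand-in for the real feature-flag bit */

typedef unsigned int (*mse_fn)(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse);

static unsigned int mse8x8_c_stub(const uint8_t *s, int ss, const uint8_t *r,
                                  int rs, unsigned int *sse) {
    (void)s; (void)ss; (void)r; (void)rs;
    *sse = 0; return 0;  /* stand-in for the portable C path */
}

static unsigned int mse8x8_sse2_stub(const uint8_t *s, int ss, const uint8_t *r,
                                     int rs, unsigned int *sse) {
    (void)s; (void)ss; (void)r; (void)rs;
    *sse = 0; return 0;  /* stand-in for the SSE2 path */
}

/* RTCD_EXTERN amounts to a global function pointer per entry point. */
static mse_fn vp9_mse8x8_ptr;

static void setup_rtcd_sketch(int flags) {
    vp9_mse8x8_ptr = mse8x8_c_stub;  /* safe portable default */
    if (flags & HAS_SSE2) vp9_mse8x8_ptr = mse8x8_sse2_stub;
}

int main(void) {
    uint8_t a[64] = {0}, b[64] = {0};
    unsigned int sse;
    setup_rtcd_sketch(HAS_SSE2);      /* pretend CPUID reported SSE2 */
    vp9_mse8x8_ptr(a, 8, b, 8, &sse); /* every call goes through the pointer */
    printf("sse = %u\n", sse);
    return 0;
}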
diff --git a/source/config/linux/ia32/vpx_config.asm b/source/config/linux/ia32/vpx_config.asm
index ddde8b0..a340007 100644
--- a/source/config/linux/ia32/vpx_config.asm
+++ b/source/config/linux/ia32/vpx_config.asm
@@ -79,6 +79,7 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_EXPERIMENTAL 0
%define CONFIG_SIZE_LIMIT 0
%define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/linux/ia32/vpx_config.h b/source/config/linux/ia32/vpx_config.h
index 705af6e..5b8fc38 100644
--- a/source/config/linux/ia32/vpx_config.h
+++ b/source/config/linux/ia32/vpx_config.h
@@ -91,6 +91,7 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 0
#define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/linux/mips64el/vp8_rtcd.h b/source/config/linux/mips64el/vp8_rtcd.h
index 58dc2fb..9848bb8 100644
--- a/source/config/linux/mips64el/vp8_rtcd.h
+++ b/source/config/linux/mips64el/vp8_rtcd.h
@@ -326,9 +326,6 @@ unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int
unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
void vp8_rtcd(void);
#include "vpx_config.h"
diff --git a/source/config/linux/mips64el/vp9_rtcd.h b/source/config/linux/mips64el/vp9_rtcd.h
index c2df3fb..5c9b779 100644
--- a/source/config/linux/mips64el/vp9_rtcd.h
+++ b/source/config/linux/mips64el/vp9_rtcd.h
@@ -28,15 +28,6 @@ struct mv;
union int_mv;
struct yv12_buffer_config;
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
diff --git a/source/config/linux/mips64el/vpx_config.h b/source/config/linux/mips64el/vpx_config.h
index 934484e..736b66a 100644
--- a/source/config/linux/mips64el/vpx_config.h
+++ b/source/config/linux/mips64el/vpx_config.h
@@ -91,6 +91,7 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 0
#define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/linux/mipsel/vp8_rtcd.h b/source/config/linux/mipsel/vp8_rtcd.h
index 58dc2fb..9848bb8 100644
--- a/source/config/linux/mipsel/vp8_rtcd.h
+++ b/source/config/linux/mipsel/vp8_rtcd.h
@@ -326,9 +326,6 @@ unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int
unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
void vp8_rtcd(void);
#include "vpx_config.h"
diff --git a/source/config/linux/mipsel/vp9_rtcd.h b/source/config/linux/mipsel/vp9_rtcd.h
index c2df3fb..5c9b779 100644
--- a/source/config/linux/mipsel/vp9_rtcd.h
+++ b/source/config/linux/mipsel/vp9_rtcd.h
@@ -28,15 +28,6 @@ struct mv;
union int_mv;
struct yv12_buffer_config;
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
diff --git a/source/config/linux/mipsel/vpx_config.h b/source/config/linux/mipsel/vpx_config.h
index 5e0b6f2..e0bb723 100644
--- a/source/config/linux/mipsel/vpx_config.h
+++ b/source/config/linux/mipsel/vpx_config.h
@@ -91,6 +91,7 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 0
#define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/linux/x64/vp8_rtcd.h b/source/config/linux/x64/vp8_rtcd.h
index 7caa03a..b2fd3d2 100644
--- a/source/config/linux/x64/vp8_rtcd.h
+++ b/source/config/linux/x64/vp8_rtcd.h
@@ -480,9 +480,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, in
unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
void vp8_rtcd(void);
#ifdef RTCD_C
diff --git a/source/config/linux/x64/vp9_rtcd.h b/source/config/linux/x64/vp9_rtcd.h
index ed9a72b..4e8678a 100644
--- a/source/config/linux/x64/vp9_rtcd.h
+++ b/source/config/linux/x64/vp9_rtcd.h
@@ -28,15 +28,6 @@ struct mv;
union int_mv;
struct yv12_buffer_config;
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_sse2
@@ -287,12 +278,10 @@ void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8
#define vp9_get16x16var vp9_get16x16var_sse2
void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
#define vp9_get8x8var vp9_get8x8var_sse2
unsigned int vp9_get_mb_ss_c(const int16_t *);
-unsigned int vp9_get_mb_ss_mmx(const int16_t *);
unsigned int vp9_get_mb_ss_sse2(const int16_t *);
#define vp9_get_mb_ss vp9_get_mb_ss_sse2
@@ -423,18 +412,20 @@ void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_sse2
unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define vp9_mse16x16 vp9_mse16x16_sse2
unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
+unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse16x8 vp9_mse16x8_sse2
unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
+unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse8x16 vp9_mse8x16_sse2
unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
+unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse8x8 vp9_mse8x8_sse2
void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
void vp9_quantize_b_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
@@ -456,7 +447,6 @@ int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int
#define vp9_refining_search_sad vp9_refining_search_sad_c
unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
#define vp9_sad16x16 vp9_sad16x16_sse2
@@ -489,7 +479,6 @@ void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t
#define vp9_sad16x32x4d vp9_sad16x32x4d_sse2
unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
#define vp9_sad16x8 vp9_sad16x8_sse2
@@ -552,7 +541,6 @@ void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t
#define vp9_sad32x64x4d vp9_sad32x64x4d_sse2
unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
#define vp9_sad4x4 vp9_sad4x4_sse
@@ -617,7 +605,6 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *re
#define vp9_sad64x64x8 vp9_sad64x64x8_c
unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
#define vp9_sad8x16 vp9_sad8x16_sse2
@@ -652,7 +639,6 @@ void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p
#define vp9_sad8x4x8 vp9_sad8x4x8_c
unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
#define vp9_sad8x8 vp9_sad8x8_sse2
@@ -842,7 +828,6 @@ void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse
unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance16x16 vp9_variance16x16_sse2
@@ -851,7 +836,6 @@ unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, c
#define vp9_variance16x32 vp9_variance16x32_sse2
unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance16x8 vp9_variance16x8_sse2
@@ -868,7 +852,6 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, c
#define vp9_variance32x64 vp9_variance32x64_sse2
unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance4x4 vp9_variance4x4_sse2
@@ -885,7 +868,6 @@ unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, c
#define vp9_variance64x64 vp9_variance64x64_sse2
unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance8x16 vp9_variance8x16_sse2
@@ -894,7 +876,6 @@ unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, con
#define vp9_variance8x4 vp9_variance8x4_sse2
unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance8x8 vp9_variance8x8_sse2
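
On x64 the SSE2 baseline is guaranteed, so the header collapses each of these to a direct #define instead of a pointer. For reference, a sketch of what the vp9_mse*_c fallbacks being paired with SSE2 variants compute, assuming the usual definition (sum of squared differences over the block, returned both via *sse and as the return value; consult the vp9 sources for the authoritative version):

#include <stdint.h>
#include <stdio.h>

static unsigned int mse8x8_ref(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
    unsigned int acc = 0;
    for (int i = 0; i < 8; ++i) {
        for (int j = 0; j < 8; ++j) {
            const int d = src[j] - ref[j];
            acc += (unsigned int)(d * d);  /* accumulate squared difference */
        }
        src += src_stride;
        ref += ref_stride;
    }
    *sse = acc;
    return acc;
}

int main(void) {
    uint8_t a[64], b[64];
    unsigned int sse;
    for (int i = 0; i < 64; ++i) { a[i] = (uint8_t)i; b[i] = (uint8_t)(i + 1); }
    mse8x8_ref(a, 8, b, 8, &sse);
    printf("sse over an 8x8 block of unit diffs = %u\n", sse);  /* prints 64 */
    return 0;
}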
diff --git a/source/config/linux/x64/vpx_config.asm b/source/config/linux/x64/vpx_config.asm
index c34dcd3..1cc8999 100644
--- a/source/config/linux/x64/vpx_config.asm
+++ b/source/config/linux/x64/vpx_config.asm
@@ -79,6 +79,7 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_EXPERIMENTAL 0
%define CONFIG_SIZE_LIMIT 0
%define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/linux/x64/vpx_config.h b/source/config/linux/x64/vpx_config.h
index 8b99a23..e88c097 100644
--- a/source/config/linux/x64/vpx_config.h
+++ b/source/config/linux/x64/vpx_config.h
@@ -91,6 +91,7 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 0
#define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/mac/ia32/vp8_rtcd.h b/source/config/mac/ia32/vp8_rtcd.h
index 4dc2d75..fd88326 100644
--- a/source/config/mac/ia32/vp8_rtcd.h
+++ b/source/config/mac/ia32/vp8_rtcd.h
@@ -480,9 +480,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, in
unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
void vp8_rtcd(void);
#ifdef RTCD_C
diff --git a/source/config/mac/ia32/vp9_rtcd.h b/source/config/mac/ia32/vp9_rtcd.h
index bd56bc3..fc9dc85 100644
--- a/source/config/mac/ia32/vp9_rtcd.h
+++ b/source/config/mac/ia32/vp9_rtcd.h
@@ -28,15 +28,6 @@ struct mv;
union int_mv;
struct yv12_buffer_config;
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
@@ -262,13 +253,10 @@ void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t
#define vp9_get16x16var vp9_get16x16var_c
void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vp9_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vp9_get8x8var vp9_get8x8var_c
unsigned int vp9_get_mb_ss_c(const int16_t *);
-unsigned int vp9_get_mb_ss_mmx(const int16_t *);
-unsigned int vp9_get_mb_ss_sse2(const int16_t *);
-RTCD_EXTERN unsigned int (*vp9_get_mb_ss)(const int16_t *);
+#define vp9_get_mb_ss vp9_get_mb_ss_c
void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
@@ -391,8 +379,7 @@ void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
RTCD_EXTERN void (*vp9_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse16x16 vp9_mse16x16_c
unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define vp9_mse16x8 vp9_mse16x8_c
@@ -419,8 +406,7 @@ int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int
#define vp9_refining_search_sad vp9_refining_search_sad_c
unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-RTCD_EXTERN unsigned int (*vp9_sad16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x16 vp9_sad16x16_c
unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vp9_sad16x16_avg vp9_sad16x16_avg_c
@@ -448,8 +434,7 @@ void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t
RTCD_EXTERN void (*vp9_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-RTCD_EXTERN unsigned int (*vp9_sad16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x8 vp9_sad16x8_c
unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vp9_sad16x8_avg vp9_sad16x8_avg_c
@@ -503,8 +488,7 @@ void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t
RTCD_EXTERN void (*vp9_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-RTCD_EXTERN unsigned int (*vp9_sad4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad4x4 vp9_sad4x4_c
unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vp9_sad4x4_avg vp9_sad4x4_avg_c
@@ -560,8 +544,7 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *re
#define vp9_sad64x64x8 vp9_sad64x64x8_c
unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-RTCD_EXTERN unsigned int (*vp9_sad8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad8x16 vp9_sad8x16_c
unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vp9_sad8x16_avg vp9_sad8x16_avg_c
@@ -591,8 +574,7 @@ void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p
#define vp9_sad8x4x8 vp9_sad8x4x8_c
unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-RTCD_EXTERN unsigned int (*vp9_sad8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad8x8 vp9_sad8x8_c
unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vp9_sad8x8_avg vp9_sad8x8_avg_c
@@ -718,15 +700,13 @@ void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance16x16 vp9_variance16x16_c
unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance16x32 vp9_variance16x32_c
unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance16x8 vp9_variance16x8_c
unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance32x16 vp9_variance32x16_c
@@ -738,8 +718,7 @@ unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, cons
#define vp9_variance32x64 vp9_variance32x64_c
unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance4x4 vp9_variance4x4_c
unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance4x8 vp9_variance4x8_c
@@ -751,15 +730,13 @@ unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, cons
#define vp9_variance64x64 vp9_variance64x64_c
unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance8x16 vp9_variance8x16_c
unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance8x4 vp9_variance8x4_c
unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance8x8 vp9_variance8x8_c
void vp9_rtcd(void);
@@ -816,11 +793,6 @@ static void setup_rtcd_internal(void)
vp9_full_search_sad = vp9_full_search_sad_c;
if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3;
if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8;
- vp9_get8x8var = vp9_get8x8var_c;
- if (flags & HAS_MMX) vp9_get8x8var = vp9_get8x8var_mmx;
- vp9_get_mb_ss = vp9_get_mb_ss_c;
- if (flags & HAS_MMX) vp9_get_mb_ss = vp9_get_mb_ss_mmx;
- if (flags & HAS_SSE2) vp9_get_mb_ss = vp9_get_mb_ss_sse2;
vp9_idct16x16_10_add = vp9_idct16x16_10_add_c;
if (flags & HAS_SSE2) vp9_idct16x16_10_add = vp9_idct16x16_10_add_sse2;
if (flags & HAS_SSSE3) vp9_idct16x16_10_add = vp9_idct16x16_10_add_ssse3;
@@ -873,10 +845,6 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) vp9_lpf_vertical_8 = vp9_lpf_vertical_8_sse2;
vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_c;
if (flags & HAS_SSE2) vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_sse2;
- vp9_mse16x16 = vp9_mse16x16_c;
- if (flags & HAS_MMX) vp9_mse16x16 = vp9_mse16x16_mmx;
- vp9_sad16x16 = vp9_sad16x16_c;
- if (flags & HAS_MMX) vp9_sad16x16 = vp9_sad16x16_mmx;
vp9_sad16x16x3 = vp9_sad16x16x3_c;
if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3;
if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3;
@@ -884,8 +852,6 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) vp9_sad16x16x4d = vp9_sad16x16x4d_sse2;
vp9_sad16x32x4d = vp9_sad16x32x4d_c;
if (flags & HAS_SSE2) vp9_sad16x32x4d = vp9_sad16x32x4d_sse2;
- vp9_sad16x8 = vp9_sad16x8_c;
- if (flags & HAS_MMX) vp9_sad16x8 = vp9_sad16x8_mmx;
vp9_sad16x8x3 = vp9_sad16x8x3_c;
if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3;
if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3;
@@ -897,8 +863,6 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) vp9_sad32x32x4d = vp9_sad32x32x4d_sse2;
vp9_sad32x64x4d = vp9_sad32x64x4d_c;
if (flags & HAS_SSE2) vp9_sad32x64x4d = vp9_sad32x64x4d_sse2;
- vp9_sad4x4 = vp9_sad4x4_c;
- if (flags & HAS_MMX) vp9_sad4x4 = vp9_sad4x4_mmx;
vp9_sad4x4x3 = vp9_sad4x4x3_c;
if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3;
vp9_sad4x4x4d = vp9_sad4x4x4d_c;
@@ -909,32 +873,18 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) vp9_sad64x32x4d = vp9_sad64x32x4d_sse2;
vp9_sad64x64x4d = vp9_sad64x64x4d_c;
if (flags & HAS_SSE2) vp9_sad64x64x4d = vp9_sad64x64x4d_sse2;
- vp9_sad8x16 = vp9_sad8x16_c;
- if (flags & HAS_MMX) vp9_sad8x16 = vp9_sad8x16_mmx;
vp9_sad8x16x3 = vp9_sad8x16x3_c;
if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3;
vp9_sad8x16x4d = vp9_sad8x16x4d_c;
if (flags & HAS_SSE2) vp9_sad8x16x4d = vp9_sad8x16x4d_sse2;
vp9_sad8x4x4d = vp9_sad8x4x4d_c;
if (flags & HAS_SSE2) vp9_sad8x4x4d = vp9_sad8x4x4d_sse2;
- vp9_sad8x8 = vp9_sad8x8_c;
- if (flags & HAS_MMX) vp9_sad8x8 = vp9_sad8x8_mmx;
vp9_sad8x8x3 = vp9_sad8x8x3_c;
if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3;
vp9_sad8x8x4d = vp9_sad8x8x4d_c;
if (flags & HAS_SSE2) vp9_sad8x8x4d = vp9_sad8x8x4d_sse2;
vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2;
- vp9_variance16x16 = vp9_variance16x16_c;
- if (flags & HAS_MMX) vp9_variance16x16 = vp9_variance16x16_mmx;
- vp9_variance16x8 = vp9_variance16x8_c;
- if (flags & HAS_MMX) vp9_variance16x8 = vp9_variance16x8_mmx;
- vp9_variance4x4 = vp9_variance4x4_c;
- if (flags & HAS_MMX) vp9_variance4x4 = vp9_variance4x4_mmx;
- vp9_variance8x16 = vp9_variance8x16_c;
- if (flags & HAS_MMX) vp9_variance8x16 = vp9_variance8x16_mmx;
- vp9_variance8x8 = vp9_variance8x8_c;
- if (flags & HAS_MMX) vp9_variance8x8 = vp9_variance8x8_mmx;
}
#endif
diff --git a/source/config/mac/ia32/vpx_config.asm b/source/config/mac/ia32/vpx_config.asm
index d06f05d..54a6abd 100644
--- a/source/config/mac/ia32/vpx_config.asm
+++ b/source/config/mac/ia32/vpx_config.asm
@@ -79,6 +79,7 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_EXPERIMENTAL 0
%define CONFIG_SIZE_LIMIT 0
%define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/mac/ia32/vpx_config.h b/source/config/mac/ia32/vpx_config.h
index 9b7b399..c3e8947 100644
--- a/source/config/mac/ia32/vpx_config.h
+++ b/source/config/mac/ia32/vpx_config.h
@@ -91,6 +91,7 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 0
#define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/mac/x64/vp8_rtcd.h b/source/config/mac/x64/vp8_rtcd.h
index 7caa03a..b2fd3d2 100644
--- a/source/config/mac/x64/vp8_rtcd.h
+++ b/source/config/mac/x64/vp8_rtcd.h
@@ -480,9 +480,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, in
unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
void vp8_rtcd(void);
#ifdef RTCD_C
diff --git a/source/config/mac/x64/vp9_rtcd.h b/source/config/mac/x64/vp9_rtcd.h
index ed9a72b..4e8678a 100644
--- a/source/config/mac/x64/vp9_rtcd.h
+++ b/source/config/mac/x64/vp9_rtcd.h
@@ -28,15 +28,6 @@ struct mv;
union int_mv;
struct yv12_buffer_config;
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_sse2
@@ -287,12 +278,10 @@ void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8
#define vp9_get16x16var vp9_get16x16var_sse2
void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
#define vp9_get8x8var vp9_get8x8var_sse2
unsigned int vp9_get_mb_ss_c(const int16_t *);
-unsigned int vp9_get_mb_ss_mmx(const int16_t *);
unsigned int vp9_get_mb_ss_sse2(const int16_t *);
#define vp9_get_mb_ss vp9_get_mb_ss_sse2
@@ -423,18 +412,20 @@ void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_sse2
unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define vp9_mse16x16 vp9_mse16x16_sse2
unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
+unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse16x8 vp9_mse16x8_sse2
unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
+unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse8x16 vp9_mse8x16_sse2
unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
+unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse8x8 vp9_mse8x8_sse2
void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
void vp9_quantize_b_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
@@ -456,7 +447,6 @@ int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int
#define vp9_refining_search_sad vp9_refining_search_sad_c
unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
#define vp9_sad16x16 vp9_sad16x16_sse2
@@ -489,7 +479,6 @@ void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t
#define vp9_sad16x32x4d vp9_sad16x32x4d_sse2
unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
#define vp9_sad16x8 vp9_sad16x8_sse2
@@ -552,7 +541,6 @@ void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t
#define vp9_sad32x64x4d vp9_sad32x64x4d_sse2
unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
#define vp9_sad4x4 vp9_sad4x4_sse
@@ -617,7 +605,6 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *re
#define vp9_sad64x64x8 vp9_sad64x64x8_c
unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
#define vp9_sad8x16 vp9_sad8x16_sse2
@@ -652,7 +639,6 @@ void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p
#define vp9_sad8x4x8 vp9_sad8x4x8_c
unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
#define vp9_sad8x8 vp9_sad8x8_sse2
@@ -842,7 +828,6 @@ void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse
unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance16x16 vp9_variance16x16_sse2
@@ -851,7 +836,6 @@ unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, c
#define vp9_variance16x32 vp9_variance16x32_sse2
unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance16x8 vp9_variance16x8_sse2
@@ -868,7 +852,6 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, c
#define vp9_variance32x64 vp9_variance32x64_sse2
unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance4x4 vp9_variance4x4_sse2
@@ -885,7 +868,6 @@ unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, c
#define vp9_variance64x64 vp9_variance64x64_sse2
unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance8x16 vp9_variance8x16_sse2
@@ -894,7 +876,6 @@ unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, con
#define vp9_variance8x4 vp9_variance8x4_sse2
unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance8x8 vp9_variance8x8_sse2
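A note on the two binding styles in these generated headers: configs that may assume a baseline instruction set, such as mac/x64 above, bind each symbol to its best variant at compile time with a plain #define, while the win/ia32 config later in this patch declares an RTCD_EXTERN function pointer that setup_rtcd_internal() fills in at run time. Compare, using vp9_mse8x8 (the second signature is abbreviated here for illustration):

    #define vp9_mse8x8 vp9_mse8x8_sse2                 /* x64: compile-time binding */
    RTCD_EXTERN unsigned int (*vp9_mse8x8)(/* ... */); /* ia32: run-time binding */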
diff --git a/source/config/mac/x64/vpx_config.asm b/source/config/mac/x64/vpx_config.asm
index c34dcd3..1cc8999 100644
--- a/source/config/mac/x64/vpx_config.asm
+++ b/source/config/mac/x64/vpx_config.asm
@@ -79,6 +79,7 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_EXPERIMENTAL 0
%define CONFIG_SIZE_LIMIT 0
%define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/mac/x64/vpx_config.h b/source/config/mac/x64/vpx_config.h
index 8b99a23..e88c097 100644
--- a/source/config/mac/x64/vpx_config.h
+++ b/source/config/mac/x64/vpx_config.h
@@ -91,6 +91,7 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 0
#define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/nacl/vp8_rtcd.h b/source/config/nacl/vp8_rtcd.h
index 298886d..79edff7 100644
--- a/source/config/nacl/vp8_rtcd.h
+++ b/source/config/nacl/vp8_rtcd.h
@@ -323,9 +323,6 @@ unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int
unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
void vp8_rtcd(void);
#include "vpx_config.h"
diff --git a/source/config/nacl/vp9_rtcd.h b/source/config/nacl/vp9_rtcd.h
index c2df3fb..5c9b779 100644
--- a/source/config/nacl/vp9_rtcd.h
+++ b/source/config/nacl/vp9_rtcd.h
@@ -28,15 +28,6 @@ struct mv;
union int_mv;
struct yv12_buffer_config;
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
diff --git a/source/config/nacl/vpx_config.asm b/source/config/nacl/vpx_config.asm
index 42f23e4..b2fa7be 100644
--- a/source/config/nacl/vpx_config.asm
+++ b/source/config/nacl/vpx_config.asm
@@ -82,6 +82,7 @@
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 0
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_SIZE_LIMIT , 0
.equ CONFIG_SPATIAL_SVC , 0
diff --git a/source/config/nacl/vpx_config.h b/source/config/nacl/vpx_config.h
index 75d1415..a16afde 100644
--- a/source/config/nacl/vpx_config.h
+++ b/source/config/nacl/vpx_config.h
@@ -91,6 +91,7 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 0
#define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/win/ia32/vp8_rtcd.h b/source/config/win/ia32/vp8_rtcd.h
index 4dc2d75..fd88326 100644
--- a/source/config/win/ia32/vp8_rtcd.h
+++ b/source/config/win/ia32/vp8_rtcd.h
@@ -480,9 +480,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, in
unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
void vp8_rtcd(void);
#ifdef RTCD_C
diff --git a/source/config/win/ia32/vp9_rtcd.h b/source/config/win/ia32/vp9_rtcd.h
index 5d4bb2f..aa34a25 100644
--- a/source/config/win/ia32/vp9_rtcd.h
+++ b/source/config/win/ia32/vp9_rtcd.h
@@ -28,15 +28,6 @@ struct mv;
union int_mv;
struct yv12_buffer_config;
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
RTCD_EXTERN int64_t (*vp9_block_error)(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
@@ -286,12 +277,10 @@ void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8
RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
RTCD_EXTERN void (*vp9_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
unsigned int vp9_get_mb_ss_c(const int16_t *);
-unsigned int vp9_get_mb_ss_mmx(const int16_t *);
unsigned int vp9_get_mb_ss_sse2(const int16_t *);
RTCD_EXTERN unsigned int (*vp9_get_mb_ss)(const int16_t *);
@@ -420,18 +409,20 @@ void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
RTCD_EXTERN void (*vp9_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
+unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
+unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
+unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_b vp9_quantize_b_c
@@ -449,7 +440,6 @@ int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int
#define vp9_refining_search_sad vp9_refining_search_sad_c
unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
RTCD_EXTERN unsigned int (*vp9_sad16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -482,7 +472,6 @@ void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t
RTCD_EXTERN void (*vp9_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
RTCD_EXTERN unsigned int (*vp9_sad16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -545,7 +534,6 @@ void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t
RTCD_EXTERN void (*vp9_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
RTCD_EXTERN unsigned int (*vp9_sad4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -610,7 +598,6 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *re
#define vp9_sad64x64x8 vp9_sad64x64x8_c
unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
RTCD_EXTERN unsigned int (*vp9_sad8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -645,7 +632,6 @@ void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p
#define vp9_sad8x4x8 vp9_sad8x4x8_c
unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
RTCD_EXTERN unsigned int (*vp9_sad8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -834,7 +820,6 @@ void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
RTCD_EXTERN void (*vp9_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -843,7 +828,6 @@ unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, c
RTCD_EXTERN unsigned int (*vp9_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp9_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -860,7 +844,6 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, c
RTCD_EXTERN unsigned int (*vp9_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp9_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -877,7 +860,6 @@ unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, c
RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp9_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -886,7 +868,6 @@ unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, con
RTCD_EXTERN unsigned int (*vp9_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -994,10 +975,8 @@ static void setup_rtcd_internal(void)
vp9_get16x16var = vp9_get16x16var_c;
if (flags & HAS_SSE2) vp9_get16x16var = vp9_get16x16var_sse2;
vp9_get8x8var = vp9_get8x8var_c;
- if (flags & HAS_MMX) vp9_get8x8var = vp9_get8x8var_mmx;
if (flags & HAS_SSE2) vp9_get8x8var = vp9_get8x8var_sse2;
vp9_get_mb_ss = vp9_get_mb_ss_c;
- if (flags & HAS_MMX) vp9_get_mb_ss = vp9_get_mb_ss_mmx;
if (flags & HAS_SSE2) vp9_get_mb_ss = vp9_get_mb_ss_sse2;
vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c;
if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3;
@@ -1060,10 +1039,14 @@ static void setup_rtcd_internal(void)
vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_c;
if (flags & HAS_SSE2) vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_sse2;
vp9_mse16x16 = vp9_mse16x16_c;
- if (flags & HAS_MMX) vp9_mse16x16 = vp9_mse16x16_mmx;
if (flags & HAS_SSE2) vp9_mse16x16 = vp9_mse16x16_sse2;
+ vp9_mse16x8 = vp9_mse16x8_c;
+ if (flags & HAS_SSE2) vp9_mse16x8 = vp9_mse16x8_sse2;
+ vp9_mse8x16 = vp9_mse8x16_c;
+ if (flags & HAS_SSE2) vp9_mse8x16 = vp9_mse8x16_sse2;
+ vp9_mse8x8 = vp9_mse8x8_c;
+ if (flags & HAS_SSE2) vp9_mse8x8 = vp9_mse8x8_sse2;
vp9_sad16x16 = vp9_sad16x16_c;
- if (flags & HAS_MMX) vp9_sad16x16 = vp9_sad16x16_mmx;
if (flags & HAS_SSE2) vp9_sad16x16 = vp9_sad16x16_sse2;
vp9_sad16x16_avg = vp9_sad16x16_avg_c;
if (flags & HAS_SSE2) vp9_sad16x16_avg = vp9_sad16x16_avg_sse2;
@@ -1079,7 +1062,6 @@ static void setup_rtcd_internal(void)
vp9_sad16x32x4d = vp9_sad16x32x4d_c;
if (flags & HAS_SSE2) vp9_sad16x32x4d = vp9_sad16x32x4d_sse2;
vp9_sad16x8 = vp9_sad16x8_c;
- if (flags & HAS_MMX) vp9_sad16x8 = vp9_sad16x8_mmx;
if (flags & HAS_SSE2) vp9_sad16x8 = vp9_sad16x8_sse2;
vp9_sad16x8_avg = vp9_sad16x8_avg_c;
if (flags & HAS_SSE2) vp9_sad16x8_avg = vp9_sad16x8_avg_sse2;
@@ -1107,7 +1089,6 @@ static void setup_rtcd_internal(void)
vp9_sad32x64x4d = vp9_sad32x64x4d_c;
if (flags & HAS_SSE2) vp9_sad32x64x4d = vp9_sad32x64x4d_sse2;
vp9_sad4x4 = vp9_sad4x4_c;
- if (flags & HAS_MMX) vp9_sad4x4 = vp9_sad4x4_mmx;
if (flags & HAS_SSE) vp9_sad4x4 = vp9_sad4x4_sse;
vp9_sad4x4_avg = vp9_sad4x4_avg_c;
if (flags & HAS_SSE) vp9_sad4x4_avg = vp9_sad4x4_avg_sse;
@@ -1134,7 +1115,6 @@ static void setup_rtcd_internal(void)
vp9_sad64x64x4d = vp9_sad64x64x4d_c;
if (flags & HAS_SSE2) vp9_sad64x64x4d = vp9_sad64x64x4d_sse2;
vp9_sad8x16 = vp9_sad8x16_c;
- if (flags & HAS_MMX) vp9_sad8x16 = vp9_sad8x16_mmx;
if (flags & HAS_SSE2) vp9_sad8x16 = vp9_sad8x16_sse2;
vp9_sad8x16_avg = vp9_sad8x16_avg_c;
if (flags & HAS_SSE2) vp9_sad8x16_avg = vp9_sad8x16_avg_sse2;
@@ -1149,7 +1129,6 @@ static void setup_rtcd_internal(void)
vp9_sad8x4x4d = vp9_sad8x4x4d_c;
if (flags & HAS_SSE2) vp9_sad8x4x4d = vp9_sad8x4x4d_sse2;
vp9_sad8x8 = vp9_sad8x8_c;
- if (flags & HAS_MMX) vp9_sad8x8 = vp9_sad8x8_mmx;
if (flags & HAS_SSE2) vp9_sad8x8 = vp9_sad8x8_sse2;
vp9_sad8x8_avg = vp9_sad8x8_avg_c;
if (flags & HAS_SSE2) vp9_sad8x8_avg = vp9_sad8x8_avg_sse2;
@@ -1254,12 +1233,10 @@ static void setup_rtcd_internal(void)
vp9_v_predictor_8x8 = vp9_v_predictor_8x8_c;
if (flags & HAS_SSE) vp9_v_predictor_8x8 = vp9_v_predictor_8x8_sse;
vp9_variance16x16 = vp9_variance16x16_c;
- if (flags & HAS_MMX) vp9_variance16x16 = vp9_variance16x16_mmx;
if (flags & HAS_SSE2) vp9_variance16x16 = vp9_variance16x16_sse2;
vp9_variance16x32 = vp9_variance16x32_c;
if (flags & HAS_SSE2) vp9_variance16x32 = vp9_variance16x32_sse2;
vp9_variance16x8 = vp9_variance16x8_c;
- if (flags & HAS_MMX) vp9_variance16x8 = vp9_variance16x8_mmx;
if (flags & HAS_SSE2) vp9_variance16x8 = vp9_variance16x8_sse2;
vp9_variance32x16 = vp9_variance32x16_c;
if (flags & HAS_SSE2) vp9_variance32x16 = vp9_variance32x16_sse2;
@@ -1268,7 +1245,6 @@ static void setup_rtcd_internal(void)
vp9_variance32x64 = vp9_variance32x64_c;
if (flags & HAS_SSE2) vp9_variance32x64 = vp9_variance32x64_sse2;
vp9_variance4x4 = vp9_variance4x4_c;
- if (flags & HAS_MMX) vp9_variance4x4 = vp9_variance4x4_mmx;
if (flags & HAS_SSE2) vp9_variance4x4 = vp9_variance4x4_sse2;
vp9_variance4x8 = vp9_variance4x8_c;
if (flags & HAS_SSE2) vp9_variance4x8 = vp9_variance4x8_sse2;
@@ -1277,12 +1253,10 @@ static void setup_rtcd_internal(void)
vp9_variance64x64 = vp9_variance64x64_c;
if (flags & HAS_SSE2) vp9_variance64x64 = vp9_variance64x64_sse2;
vp9_variance8x16 = vp9_variance8x16_c;
- if (flags & HAS_MMX) vp9_variance8x16 = vp9_variance8x16_mmx;
if (flags & HAS_SSE2) vp9_variance8x16 = vp9_variance8x16_sse2;
vp9_variance8x4 = vp9_variance8x4_c;
if (flags & HAS_SSE2) vp9_variance8x4 = vp9_variance8x4_sse2;
vp9_variance8x8 = vp9_variance8x8_c;
- if (flags & HAS_MMX) vp9_variance8x8 = vp9_variance8x8_mmx;
if (flags & HAS_SSE2) vp9_variance8x8 = vp9_variance8x8_sse2;
}
#endif
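Since many hunks in this roll touch the dispatch table above, a word on the pattern: on ia32 every entry point starts out as the portable C routine and is upgraded inside setup_rtcd_internal() to the fastest variant the CPU reports; this patch removes the MMX rungs, leaving a C -> SSE2 ladder, and adds SSE2 rungs for the three new mse sizes. A minimal self-contained sketch of the same idea, with illustrative names (my_mse8x8, my_setup_rtcd, and the flag value are stand-ins, not libvpx API):

    #include <stdint.h>

    #define HAS_SSE2 0x01  /* stand-in for the real cpuid-derived flag */

    static unsigned int my_mse8x8_c(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
      unsigned int total = 0;
      int r, c;
      for (r = 0; r < 8; ++r) {
        for (c = 0; c < 8; ++c) {
          const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
          total += (unsigned int)(d * d);
        }
      }
      *sse = total;
      return total;
    }

    /* A real build would supply an assembly or intrinsics variant; the C
     * routine stands in for it so the sketch stays self-contained. */
    #define my_mse8x8_sse2 my_mse8x8_c

    static unsigned int (*my_mse8x8)(const uint8_t *, int, const uint8_t *,
                                     int, unsigned int *);

    static void my_setup_rtcd(int flags) {
      my_mse8x8 = my_mse8x8_c;                          /* portable default */
      if (flags & HAS_SSE2) my_mse8x8 = my_mse8x8_sse2; /* upgrade if present */
    }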
diff --git a/source/config/win/ia32/vpx_config.asm b/source/config/win/ia32/vpx_config.asm
index 87a317c..9522ee1 100644
--- a/source/config/win/ia32/vpx_config.asm
+++ b/source/config/win/ia32/vpx_config.asm
@@ -79,6 +79,7 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_EXPERIMENTAL 0
%define CONFIG_SIZE_LIMIT 0
%define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/win/ia32/vpx_config.h b/source/config/win/ia32/vpx_config.h
index 601cd8d..8fef84f 100644
--- a/source/config/win/ia32/vpx_config.h
+++ b/source/config/win/ia32/vpx_config.h
@@ -91,6 +91,7 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 0
#define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/win/x64/vp8_rtcd.h b/source/config/win/x64/vp8_rtcd.h
index 7caa03a..b2fd3d2 100644
--- a/source/config/win/x64/vp8_rtcd.h
+++ b/source/config/win/x64/vp8_rtcd.h
@@ -480,9 +480,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, in
unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
void vp8_rtcd(void);
#ifdef RTCD_C
diff --git a/source/config/win/x64/vp9_rtcd.h b/source/config/win/x64/vp9_rtcd.h
index ed9a72b..4e8678a 100644
--- a/source/config/win/x64/vp9_rtcd.h
+++ b/source/config/win/x64/vp9_rtcd.h
@@ -28,15 +28,6 @@ struct mv;
union int_mv;
struct yv12_buffer_config;
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_sse2
@@ -287,12 +278,10 @@ void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8
#define vp9_get16x16var vp9_get16x16var_sse2
void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
#define vp9_get8x8var vp9_get8x8var_sse2
unsigned int vp9_get_mb_ss_c(const int16_t *);
-unsigned int vp9_get_mb_ss_mmx(const int16_t *);
unsigned int vp9_get_mb_ss_sse2(const int16_t *);
#define vp9_get_mb_ss vp9_get_mb_ss_sse2
@@ -423,18 +412,20 @@ void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_sse2
unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define vp9_mse16x16 vp9_mse16x16_sse2
unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
+unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse16x8 vp9_mse16x8_sse2
unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
+unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse8x16 vp9_mse8x16_sse2
unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
+unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse8x8 vp9_mse8x8_sse2
void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
void vp9_quantize_b_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
@@ -456,7 +447,6 @@ int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int
#define vp9_refining_search_sad vp9_refining_search_sad_c
unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
#define vp9_sad16x16 vp9_sad16x16_sse2
@@ -489,7 +479,6 @@ void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t
#define vp9_sad16x32x4d vp9_sad16x32x4d_sse2
unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
#define vp9_sad16x8 vp9_sad16x8_sse2
@@ -552,7 +541,6 @@ void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t
#define vp9_sad32x64x4d vp9_sad32x64x4d_sse2
unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
#define vp9_sad4x4 vp9_sad4x4_sse
@@ -617,7 +605,6 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *re
#define vp9_sad64x64x8 vp9_sad64x64x8_c
unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
#define vp9_sad8x16 vp9_sad8x16_sse2
@@ -652,7 +639,6 @@ void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p
#define vp9_sad8x4x8 vp9_sad8x4x8_c
unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
#define vp9_sad8x8 vp9_sad8x8_sse2
@@ -842,7 +828,6 @@ void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse
unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance16x16 vp9_variance16x16_sse2
@@ -851,7 +836,6 @@ unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, c
#define vp9_variance16x32 vp9_variance16x32_sse2
unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance16x8 vp9_variance16x8_sse2
@@ -868,7 +852,6 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, c
#define vp9_variance32x64 vp9_variance32x64_sse2
unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance4x4 vp9_variance4x4_sse2
@@ -885,7 +868,6 @@ unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, c
#define vp9_variance64x64 vp9_variance64x64_sse2
unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance8x16 vp9_variance8x16_sse2
@@ -894,7 +876,6 @@ unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, con
#define vp9_variance8x4 vp9_variance8x4_sse2
unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vp9_variance8x8 vp9_variance8x8_sse2
diff --git a/source/config/win/x64/vpx_config.asm b/source/config/win/x64/vpx_config.asm
index 9af49e7..f184ca7 100644
--- a/source/config/win/x64/vpx_config.asm
+++ b/source/config/win/x64/vpx_config.asm
@@ -79,6 +79,7 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_EXPERIMENTAL 0
%define CONFIG_SIZE_LIMIT 0
%define CONFIG_SPATIAL_SVC 0
diff --git a/source/config/win/x64/vpx_config.h b/source/config/win/x64/vpx_config.h
index 9747cec..75777f0 100644
--- a/source/config/win/x64/vpx_config.h
+++ b/source/config/win/x64/vpx_config.h
@@ -91,6 +91,7 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 0
#define CONFIG_SPATIAL_SVC 0
diff --git a/source/libvpx/build/make/gen_msvs_proj.sh b/source/libvpx/build/make/gen_msvs_proj.sh
index 3653309..7907225 100755
--- a/source/libvpx/build/make/gen_msvs_proj.sh
+++ b/source/libvpx/build/make/gen_msvs_proj.sh
@@ -245,13 +245,13 @@ esac
case "$target" in
x86_64*)
platforms[0]="x64"
- asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} &quot;\$(InputPath)&quot;"
- asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} &quot;\$(InputPath)&quot;"
+ asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} &quot;\$(InputPath)&quot;"
+ asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} &quot;\$(InputPath)&quot;"
;;
x86*)
platforms[0]="Win32"
- asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} &quot;\$(InputPath)&quot;"
- asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} &quot;\$(InputPath)&quot;"
+ asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} &quot;\$(InputPath)&quot;"
+ asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} &quot;\$(InputPath)&quot;"
;;
*) die "Unsupported target $target!"
;;
diff --git a/source/libvpx/build/make/gen_msvs_vcxproj.sh b/source/libvpx/build/make/gen_msvs_vcxproj.sh
index 23ef6a3..56b9a3b 100755
--- a/source/libvpx/build/make/gen_msvs_vcxproj.sh
+++ b/source/libvpx/build/make/gen_msvs_vcxproj.sh
@@ -253,13 +253,13 @@ libs=${libs// /;}
case "$target" in
x86_64*)
platforms[0]="x64"
- asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} &quot;%(FullPath)&quot;"
- asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} &quot;%(FullPath)&quot;"
+ asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
+ asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
;;
x86*)
platforms[0]="Win32"
- asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} &quot;%(FullPath)&quot;"
- asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} &quot;%(FullPath)&quot;"
+ asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
+ asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
;;
arm*)
asm_Debug_cmdline="armasm -nologo &quot;%(FullPath)&quot;"
diff --git a/source/libvpx/configure b/source/libvpx/configure
index 2708b45..32b70f1 100755
--- a/source/libvpx/configure
+++ b/source/libvpx/configure
@@ -334,6 +334,7 @@ CONFIG_LIST="
multi_res_encoding
temporal_denoising
coefficient_range_checking
+ vp9_highbitdepth
experimental
size_limit
${EXPERIMENT_LIST}
@@ -392,6 +393,7 @@ CMDLINE_SELECT="
multi_res_encoding
temporal_denoising
coefficient_range_checking
+ vp9_highbitdepth
experimental
"
diff --git a/source/libvpx/examples.mk b/source/libvpx/examples.mk
index 91bd45a..bd38c41 100644
--- a/source/libvpx/examples.mk
+++ b/source/libvpx/examples.mk
@@ -31,6 +31,7 @@ LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \
third_party/libyuv/source/scale_common.cc \
third_party/libyuv/source/scale_mips.cc \
third_party/libyuv/source/scale_neon.cc \
+ third_party/libyuv/source/scale_neon64.cc \
third_party/libyuv/source/scale_posix.cc \
third_party/libyuv/source/scale_win.cc \
@@ -185,7 +186,9 @@ vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame
ifeq ($(CONFIG_MULTI_RES_ENCODING),yes)
ifeq ($(CONFIG_LIBYUV),yes)
EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_multi_resolution_encoder.c
+vp8_multi_resolution_encoder.SRCS += ivfenc.h ivfenc.c
vp8_multi_resolution_encoder.SRCS += tools_common.h tools_common.c
+vp8_multi_resolution_encoder.SRCS += video_writer.h video_writer.c
vp8_multi_resolution_encoder.SRCS += $(LIBYUV_SRCS)
vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de
vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding
diff --git a/source/libvpx/examples/set_maps.c b/source/libvpx/examples/set_maps.c
index ff60d51..2ee5bca 100644
--- a/source/libvpx/examples/set_maps.c
+++ b/source/libvpx/examples/set_maps.c
@@ -42,6 +42,7 @@
// Use the `simple_decoder` example to decode this sample, and observe
// the change in the image at frames 22, 33, and 44.
+#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -177,9 +178,10 @@ int main(int argc, char **argv) {
memset(&info, 0, sizeof(info));
encoder = get_vpx_encoder_by_name(argv[1]);
- if (!encoder)
+ if (encoder == NULL) {
die("Unsupported codec.");
-
+ }
+ assert(encoder != NULL);
info.codec_fourcc = encoder->fourcc;
info.frame_width = strtol(argv[2], NULL, 0);
info.frame_height = strtol(argv[3], NULL, 0);
diff --git a/source/libvpx/examples/twopass_encoder.c b/source/libvpx/examples/twopass_encoder.c
index 369b1d8..76d5a28 100644
--- a/source/libvpx/examples/twopass_encoder.c
+++ b/source/libvpx/examples/twopass_encoder.c
@@ -66,13 +66,14 @@ void usage_exit() {
exit(EXIT_FAILURE);
}
-static void get_frame_stats(vpx_codec_ctx_t *ctx,
- const vpx_image_t *img,
- vpx_codec_pts_t pts,
- unsigned int duration,
- vpx_enc_frame_flags_t flags,
- unsigned int deadline,
- vpx_fixed_buf_t *stats) {
+static int get_frame_stats(vpx_codec_ctx_t *ctx,
+ const vpx_image_t *img,
+ vpx_codec_pts_t pts,
+ unsigned int duration,
+ vpx_enc_frame_flags_t flags,
+ unsigned int deadline,
+ vpx_fixed_buf_t *stats) {
+ int got_pkts = 0;
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *pkt = NULL;
const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags,
@@ -81,6 +82,8 @@ static void get_frame_stats(vpx_codec_ctx_t *ctx,
die_codec(ctx, "Failed to get frame stats.");
while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) {
+ got_pkts = 1;
+
if (pkt->kind == VPX_CODEC_STATS_PKT) {
const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
const size_t pkt_size = pkt->data.twopass_stats.sz;
@@ -89,15 +92,18 @@ static void get_frame_stats(vpx_codec_ctx_t *ctx,
stats->sz += pkt_size;
}
}
+
+ return got_pkts;
}
-static void encode_frame(vpx_codec_ctx_t *ctx,
- const vpx_image_t *img,
- vpx_codec_pts_t pts,
- unsigned int duration,
- vpx_enc_frame_flags_t flags,
- unsigned int deadline,
- VpxVideoWriter *writer) {
+static int encode_frame(vpx_codec_ctx_t *ctx,
+ const vpx_image_t *img,
+ vpx_codec_pts_t pts,
+ unsigned int duration,
+ vpx_enc_frame_flags_t flags,
+ unsigned int deadline,
+ VpxVideoWriter *writer) {
+ int got_pkts = 0;
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *pkt = NULL;
const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags,
@@ -106,6 +112,7 @@ static void encode_frame(vpx_codec_ctx_t *ctx,
die_codec(ctx, "Failed to encode frame.");
while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) {
+ got_pkts = 1;
if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
@@ -117,19 +124,90 @@ static void encode_frame(vpx_codec_ctx_t *ctx,
fflush(stdout);
}
}
+
+ return got_pkts;
+}
+
+static vpx_fixed_buf_t pass0(vpx_image_t *raw,
+ FILE *infile,
+ const VpxInterface *encoder,
+ const vpx_codec_enc_cfg_t *cfg) {
+ vpx_codec_ctx_t codec;
+ int frame_count = 0;
+ vpx_fixed_buf_t stats = {NULL, 0};
+
+ if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+ die_codec(&codec, "Failed to initialize encoder");
+
+ // Calculate frame statistics.
+ while (vpx_img_read(raw, infile)) {
+ ++frame_count;
+ get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
+ &stats);
+ }
+
+ // Flush encoder.
+ while (get_frame_stats(&codec, NULL, frame_count, 1, 0,
+ VPX_DL_BEST_QUALITY, &stats)) {}
+
+ printf("Pass 0 complete. Processed %d frames.\n", frame_count);
+ if (vpx_codec_destroy(&codec))
+ die_codec(&codec, "Failed to destroy codec.");
+
+ return stats;
+}
+
+static void pass1(vpx_image_t *raw,
+ FILE *infile,
+ const char *outfile_name,
+ const VpxInterface *encoder,
+ const vpx_codec_enc_cfg_t *cfg) {
+ VpxVideoInfo info = {
+ encoder->fourcc,
+ cfg->g_w,
+ cfg->g_h,
+ {cfg->g_timebase.num, cfg->g_timebase.den}
+ };
+ VpxVideoWriter *writer = NULL;
+ vpx_codec_ctx_t codec;
+ int frame_count = 0;
+
+ writer = vpx_video_writer_open(outfile_name, kContainerIVF, &info);
+ if (!writer)
+ die("Failed to open %s for writing", outfile_name);
+
+ if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+ die_codec(&codec, "Failed to initialize encoder");
+
+ // Encode frames.
+ while (vpx_img_read(raw, infile)) {
+ ++frame_count;
+ encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY, writer);
+ }
+
+ // Flush encoder.
+ while (encode_frame(&codec, NULL, -1, 1, 0, VPX_DL_BEST_QUALITY, writer)) {}
+
+ printf("\n");
+
+ if (vpx_codec_destroy(&codec))
+ die_codec(&codec, "Failed to destroy codec.");
+
+ vpx_video_writer_close(writer);
+
+ printf("Pass 1 complete. Processed %d frames.\n", frame_count);
}
int main(int argc, char **argv) {
FILE *infile = NULL;
- VpxVideoWriter *writer = NULL;
+ int w, h;
vpx_codec_ctx_t codec;
vpx_codec_enc_cfg_t cfg;
vpx_image_t raw;
vpx_codec_err_t res;
- vpx_fixed_buf_t stats = {0};
- VpxVideoInfo info = {0};
+ vpx_fixed_buf_t stats;
+
const VpxInterface *encoder = NULL;
- int pass;
const int fps = 30; // TODO(dkovalev) add command line argument
const int bitrate = 200; // kbit/s TODO(dkovalev) add command line argument
const char *const codec_arg = argv[1];
@@ -146,85 +224,44 @@ int main(int argc, char **argv) {
if (!encoder)
die("Unsupported codec.");
- info.codec_fourcc = encoder->fourcc;
- info.time_base.numerator = 1;
- info.time_base.denominator = fps;
- info.frame_width = strtol(width_arg, NULL, 0);
- info.frame_height = strtol(height_arg, NULL, 0);
-
- if (info.frame_width <= 0 ||
- info.frame_height <= 0 ||
- (info.frame_width % 2) != 0 ||
- (info.frame_height % 2) != 0) {
- die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
- }
+ w = strtol(width_arg, NULL, 0);
+ h = strtol(height_arg, NULL, 0);
- if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
- info.frame_height, 1)) {
- die("Failed to allocate image", info.frame_width, info.frame_height);
- }
+ if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0)
+ die("Invalid frame size: %dx%d", w, h);
- writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
- if (!writer)
- die("Failed to open %s for writing", outfile_arg);
+ if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1))
+ die("Failed to allocate image", w, h);
printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+ // Configuration
res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
if (res)
die_codec(&codec, "Failed to get default codec config.");
- cfg.g_w = info.frame_width;
- cfg.g_h = info.frame_height;
- cfg.g_timebase.num = info.time_base.numerator;
- cfg.g_timebase.den = info.time_base.denominator;
+ cfg.g_w = w;
+ cfg.g_h = h;
+ cfg.g_timebase.num = 1;
+ cfg.g_timebase.den = fps;
cfg.rc_target_bitrate = bitrate;
- for (pass = 0; pass < 2; ++pass) {
- int frame_count = 0;
-
- if (pass == 0) {
- cfg.g_pass = VPX_RC_FIRST_PASS;
- } else {
- cfg.g_pass = VPX_RC_LAST_PASS;
- cfg.rc_twopass_stats_in = stats;
- }
-
- if (!(infile = fopen(infile_arg, "rb")))
- die("Failed to open %s for reading", infile_arg);
-
- if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
- die_codec(&codec, "Failed to initialize encoder");
-
- while (vpx_img_read(&raw, infile)) {
- ++frame_count;
+ if (!(infile = fopen(infile_arg, "rb")))
+ die("Failed to open %s for reading", infile_arg);
- if (pass == 0) {
- get_frame_stats(&codec, &raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
- &stats);
- } else {
- encode_frame(&codec, &raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
- writer);
- }
- }
-
- if (pass == 0) {
- get_frame_stats(&codec, NULL, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
- &stats);
- } else {
- printf("\n");
- }
+ // Pass 0
+ cfg.g_pass = VPX_RC_FIRST_PASS;
+ stats = pass0(&raw, infile, encoder, &cfg);
- fclose(infile);
- printf("Pass %d complete. Processed %d frames.\n", pass + 1, frame_count);
- if (vpx_codec_destroy(&codec))
- die_codec(&codec, "Failed to destroy codec.");
- }
-
- vpx_img_free(&raw);
+ // Pass 1
+ rewind(infile);
+ cfg.g_pass = VPX_RC_LAST_PASS;
+ cfg.rc_twopass_stats_in = stats;
+ pass1(&raw, infile, outfile_arg, encoder, &cfg);
free(stats.buf);
- vpx_video_writer_close(writer);
+ vpx_img_free(&raw);
+ fclose(infile);
return EXIT_SUCCESS;
}
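The net effect of this rewrite is to split the old two-iteration loop in main() into pass0() and pass1(), and to have get_frame_stats()/encode_frame() report whether the encoder returned any packets. That return value is what makes correct flushing possible: after the last input frame, the encode step is repeated with a NULL image until no packets come back. Condensed from pass1() above (error handling elided):

    // Encode every input frame.
    while (vpx_img_read(&raw, infile))
      encode_frame(&codec, &raw, ++frame_count, 1, 0, VPX_DL_BEST_QUALITY,
                   writer);

    // Flush: a NULL image signals end-of-stream; loop until the encoder
    // has no more packets to hand back.
    while (encode_frame(&codec, NULL, -1, 1, 0, VPX_DL_BEST_QUALITY, writer)) {}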
diff --git a/source/libvpx/examples/vp8_multi_resolution_encoder.c b/source/libvpx/examples/vp8_multi_resolution_encoder.c
index d41e442..7c050fa 100644
--- a/source/libvpx/examples/vp8_multi_resolution_encoder.c
+++ b/source/libvpx/examples/vp8_multi_resolution_encoder.c
@@ -8,446 +8,293 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-/*
- * This is an example demonstrating multi-resolution encoding in VP8.
- * High-resolution input video is down-sampled to lower-resolutions. The
- * encoder then encodes the video and outputs multiple bitstreams with
- * different resolutions.
- */
+
+// This is an example demonstrating multi-resolution encoding in VP8.
+// High-resolution input video is down-sampled to lower-resolutions. The
+// encoder then encodes the video and outputs multiple bitstreams with
+// different resolutions.
+//
+// Configure with --enable-multi-res-encoding flag to enable this example.
+
#include <stdio.h>
#include <stdlib.h>
-#include <stdarg.h>
#include <string.h>
-#include <math.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
-#include "vpx/vpx_encoder.h"
-#include "vpx/vp8cx.h"
-#include "vpx_ports/mem_ops.h"
-#include "./tools_common.h"
-#define interface (vpx_codec_vp8_cx())
-#define fourcc 0x30385056
-
-void usage_exit() {
- exit(EXIT_FAILURE);
-}
-
-/*
- * The input video frame is downsampled several times to generate a multi-level
- * hierarchical structure. NUM_ENCODERS is defined as the number of encoding
- * levels required. For example, if the size of input video is 1280x720,
- * NUM_ENCODERS is 3, and down-sampling factor is 2, the encoder outputs 3
- * bitstreams with resolution of 1280x720(level 0), 640x360(level 1), and
- * 320x180(level 2) respectively.
- */
-#define NUM_ENCODERS 3
-/* This example uses the scaler function in libyuv. */
#include "third_party/libyuv/include/libyuv/basic_types.h"
#include "third_party/libyuv/include/libyuv/scale.h"
#include "third_party/libyuv/include/libyuv/cpu_id.h"
-int (*read_frame_p)(FILE *f, vpx_image_t *img);
-
-static int read_frame(FILE *f, vpx_image_t *img) {
- size_t nbytes, to_read;
- int res = 1;
-
- to_read = img->w*img->h*3/2;
- nbytes = fread(img->planes[0], 1, to_read, f);
- if(nbytes != to_read) {
- res = 0;
- if(nbytes > 0)
- printf("Warning: Read partial frame. Check your width & height!\n");
- }
- return res;
-}
-
-static int read_frame_by_row(FILE *f, vpx_image_t *img) {
- size_t nbytes, to_read;
- int res = 1;
- int plane;
-
- for (plane = 0; plane < 3; plane++)
- {
- unsigned char *ptr;
- int w = (plane ? (1 + img->d_w) / 2 : img->d_w);
- int h = (plane ? (1 + img->d_h) / 2 : img->d_h);
- int r;
-
- /* Determine the correct plane based on the image format. The for-loop
- * always counts in Y,U,V order, but this may not match the order of
- * the data on disk.
- */
- switch (plane)
- {
- case 1:
- ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12? VPX_PLANE_V : VPX_PLANE_U];
- break;
- case 2:
- ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12?VPX_PLANE_U : VPX_PLANE_V];
- break;
- default:
- ptr = img->planes[plane];
- }
-
- for (r = 0; r < h; r++)
- {
- to_read = w;
-
- nbytes = fread(ptr, 1, to_read, f);
- if(nbytes != to_read) {
- res = 0;
- if(nbytes > 0)
- printf("Warning: Read partial frame. Check your width & height!\n");
- break;
- }
-
- ptr += img->stride[plane];
- }
- if (!res)
- break;
- }
-
- return res;
-}
-
-static void write_ivf_file_header(FILE *outfile,
- const vpx_codec_enc_cfg_t *cfg,
- int frame_cnt) {
- char header[32];
-
- if(cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
- return;
- header[0] = 'D';
- header[1] = 'K';
- header[2] = 'I';
- header[3] = 'F';
- mem_put_le16(header+4, 0); /* version */
- mem_put_le16(header+6, 32); /* headersize */
- mem_put_le32(header+8, fourcc); /* headersize */
- mem_put_le16(header+12, cfg->g_w); /* width */
- mem_put_le16(header+14, cfg->g_h); /* height */
- mem_put_le32(header+16, cfg->g_timebase.den); /* rate */
- mem_put_le32(header+20, cfg->g_timebase.num); /* scale */
- mem_put_le32(header+24, frame_cnt); /* length */
- mem_put_le32(header+28, 0); /* unused */
-
- (void) fwrite(header, 1, 32, outfile);
-}
+#define VPX_CODEC_DISABLE_COMPAT 1
+#include "vpx/vpx_encoder.h"
+#include "vpx/vp8cx.h"
-static void write_ivf_frame_header(FILE *outfile,
- const vpx_codec_cx_pkt_t *pkt)
-{
- char header[12];
- vpx_codec_pts_t pts;
+#include "./tools_common.h"
+#include "./video_writer.h"
- if(pkt->kind != VPX_CODEC_CX_FRAME_PKT)
- return;
+// The input video frame is downsampled several times to generate a
+// multi-level hierarchical structure. kNumEncoders is defined as the number
+// of encoding levels required. For example, if the size of input video is
+// 1280x720, kNumEncoders is 3, and down-sampling factor is 2, the encoder
+// outputs 3 bitstreams with resolutions of 1280x720 (level 0),
+// 640x360 (level 1), and 320x180 (level 2), respectively.
+#define kNumEncoders 3
- pts = pkt->data.frame.pts;
- mem_put_le32(header, pkt->data.frame.sz);
- mem_put_le32(header+4, pts&0xFFFFFFFF);
- mem_put_le32(header+8, pts >> 32);
+static const char *exec_name;
- (void) fwrite(header, 1, 12, outfile);
+void usage_exit() {
+ fprintf(stderr,
+ "Usage: %s <width> <height> <infile> <outfile(s)> <output psnr?>\n",
+ exec_name);
+ exit(EXIT_FAILURE);
}
-int main(int argc, char **argv)
-{
- FILE *infile, *outfile[NUM_ENCODERS];
- vpx_codec_ctx_t codec[NUM_ENCODERS];
- vpx_codec_enc_cfg_t cfg[NUM_ENCODERS];
- vpx_codec_pts_t frame_cnt = 0;
- vpx_image_t raw[NUM_ENCODERS];
- vpx_codec_err_t res[NUM_ENCODERS];
-
- int i;
- long width;
- long height;
- int frame_avail;
- int got_data;
- int flags = 0;
-
- /*Currently, only realtime mode is supported in multi-resolution encoding.*/
- int arg_deadline = VPX_DL_REALTIME;
-
- /* Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you
- don't need to know PSNR, which will skip PSNR calculation and save
- encoding time. */
- int show_psnr = 0;
- uint64_t psnr_sse_total[NUM_ENCODERS] = {0};
- uint64_t psnr_samples_total[NUM_ENCODERS] = {0};
- double psnr_totals[NUM_ENCODERS][4] = {{0,0}};
- int psnr_count[NUM_ENCODERS] = {0};
-
- /* Set the required target bitrates for each resolution level.
- * If target bitrate for highest-resolution level is set to 0,
- * (i.e. target_bitrate[0]=0), we skip encoding at that level.
- */
- unsigned int target_bitrate[NUM_ENCODERS]={1000, 500, 100};
- /* Enter the frame rate of the input video */
- int framerate = 30;
- /* Set down-sampling factor for each resolution level.
- dsf[0] controls down sampling from level 0 to level 1;
- dsf[1] controls down sampling from level 1 to level 2;
- dsf[2] is not used. */
- vpx_rational_t dsf[NUM_ENCODERS] = {{2, 1}, {2, 1}, {1, 1}};
-
- if(argc!= (5+NUM_ENCODERS))
- die("Usage: %s <width> <height> <infile> <outfile(s)> <output psnr?>\n",
- argv[0]);
-
- printf("Using %s\n",vpx_codec_iface_name(interface));
-
- width = strtol(argv[1], NULL, 0);
- height = strtol(argv[2], NULL, 0);
-
- if(width < 16 || width%2 || height <16 || height%2)
- die("Invalid resolution: %ldx%ld", width, height);
-
- /* Open input video file for encoding */
- if(!(infile = fopen(argv[3], "rb")))
- die("Failed to open %s for reading", argv[3]);
-
- /* Open output file for each encoder to output bitstreams */
- for (i=0; i< NUM_ENCODERS; i++)
- {
- if(!target_bitrate[i])
- {
- outfile[i] = NULL;
- continue;
- }
-
- if(!(outfile[i] = fopen(argv[i+4], "wb")))
- die("Failed to open %s for writing", argv[i+4]);
- }
-
- show_psnr = strtol(argv[NUM_ENCODERS + 4], NULL, 0);
-
- /* Populate default encoder configuration */
- for (i=0; i< NUM_ENCODERS; i++)
- {
- res[i] = vpx_codec_enc_config_default(interface, &cfg[i], 0);
- if(res[i]) {
- printf("Failed to get config: %s\n", vpx_codec_err_to_string(res[i]));
- return EXIT_FAILURE;
- }
+int main(int argc, char *argv[]) {
+ int frame_cnt = 0;
+ FILE *infile = NULL;
+ VpxVideoWriter *writers[kNumEncoders];
+ vpx_codec_ctx_t codec[kNumEncoders];
+ vpx_codec_enc_cfg_t cfg[kNumEncoders];
+ vpx_image_t raw[kNumEncoders];
+ const VpxInterface *const encoder = get_vpx_encoder_by_name("vp8");
+ // Currently, only realtime mode is supported in multi-resolution encoding.
+ const int arg_deadline = VPX_DL_REALTIME;
+ int i;
+ int width = 0;
+ int height = 0;
+ int frame_avail = 0;
+ int got_data = 0;
+
+ // Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you
+ // don't need to know PSNR, which will skip PSNR calculation and save
+ // encoding time.
+ int show_psnr = 0;
+ uint64_t psnr_sse_total[kNumEncoders] = {0};
+ uint64_t psnr_samples_total[kNumEncoders] = {0};
+ double psnr_totals[kNumEncoders][4] = {{0, 0}};
+ int psnr_count[kNumEncoders] = {0};
+
+ // Set the required target bitrates for each resolution level.
+ // If target bitrate for highest-resolution level is set to 0,
+ // (i.e. target_bitrate[0]=0), we skip encoding at that level.
+ unsigned int target_bitrate[kNumEncoders] = {1000, 500, 100};
+
+ // Enter the frame rate of the input video.
+ const int framerate = 30;
+ // Set down-sampling factor for each resolution level.
+ // dsf[0] controls down sampling from level 0 to level 1;
+ // dsf[1] controls down sampling from level 1 to level 2;
+ // dsf[2] is not used.
+ vpx_rational_t dsf[kNumEncoders] = {{2, 1}, {2, 1}, {1, 1}};
+
+ exec_name = argv[0];
+
+ if (!encoder)
+ die("Unsupported codec.");
+
+ // exe_name, input width, input height, input file,
+ // output file 1, output file 2, output file 3, psnr on/off
+ if (argc != (5 + kNumEncoders))
+ die("Invalid number of input options.");
+
+ printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+
+ width = strtol(argv[1], NULL, 0);
+ height = strtol(argv[2], NULL, 0);
+
+ if (width < 16 || width % 2 || height < 16 || height % 2)
+ die("Invalid resolution: %ldx%ld", width, height);
+
+ // Open input video file for encoding
+ if (!(infile = fopen(argv[3], "rb")))
+ die("Failed to open %s for reading", argv[3]);
+
+ show_psnr = strtol(argv[kNumEncoders + 4], NULL, 0);
+
+ // Populate default encoder configuration
+ for (i = 0; i < kNumEncoders; ++i) {
+ vpx_codec_err_t res =
+ vpx_codec_enc_config_default(encoder->codec_interface(), &cfg[i], 0);
+ if (res != VPX_CODEC_OK) {
+ printf("Failed to get config: %s\n", vpx_codec_err_to_string(res));
+ return EXIT_FAILURE;
}
-
- /*
- * Update the default configuration according to needs of the application.
- */
- /* Highest-resolution encoder settings */
- cfg[0].g_w = width;
- cfg[0].g_h = height;
- cfg[0].g_threads = 1; /* number of threads used */
- cfg[0].rc_dropframe_thresh = 30;
- cfg[0].rc_end_usage = VPX_CBR;
- cfg[0].rc_resize_allowed = 0;
- cfg[0].rc_min_quantizer = 4;
- cfg[0].rc_max_quantizer = 56;
- cfg[0].rc_undershoot_pct = 98;
- cfg[0].rc_overshoot_pct = 100;
- cfg[0].rc_buf_initial_sz = 500;
- cfg[0].rc_buf_optimal_sz = 600;
- cfg[0].rc_buf_sz = 1000;
- cfg[0].g_error_resilient = 1; /* Enable error resilient mode */
- cfg[0].g_lag_in_frames = 0;
-
- /* Disable automatic keyframe placement */
- /* Note: These 3 settings are copied to all levels. But, except the lowest
- * resolution level, all other levels are set to VPX_KF_DISABLED internally.
- */
- //cfg[0].kf_mode = VPX_KF_DISABLED;
- cfg[0].kf_mode = VPX_KF_AUTO;
- cfg[0].kf_min_dist = 3000;
- cfg[0].kf_max_dist = 3000;
-
- cfg[0].rc_target_bitrate = target_bitrate[0]; /* Set target bitrate */
- cfg[0].g_timebase.num = 1; /* Set fps */
- cfg[0].g_timebase.den = framerate;
-
- /* Other-resolution encoder settings */
- for (i=1; i< NUM_ENCODERS; i++)
+ }
+
+ // Update the default configuration according to needs of the application.
+ // Highest-resolution encoder settings
+ cfg[0].g_w = width;
+ cfg[0].g_h = height;
+ cfg[0].g_threads = 1;
+ cfg[0].rc_dropframe_thresh = 30;
+ cfg[0].rc_end_usage = VPX_CBR;
+ cfg[0].rc_resize_allowed = 0;
+ cfg[0].rc_min_quantizer = 4;
+ cfg[0].rc_max_quantizer = 56;
+ cfg[0].rc_undershoot_pct = 98;
+ cfg[0].rc_overshoot_pct = 100;
+ cfg[0].rc_buf_initial_sz = 500;
+ cfg[0].rc_buf_optimal_sz = 600;
+ cfg[0].rc_buf_sz = 1000;
+ cfg[0].g_error_resilient = 1;
+ cfg[0].g_lag_in_frames = 0;
+  cfg[0].kf_mode = VPX_KF_AUTO;  // alternatively: VPX_KF_DISABLED
+ cfg[0].kf_min_dist = 3000;
+ cfg[0].kf_max_dist = 3000;
+ cfg[0].rc_target_bitrate = target_bitrate[0];
+ cfg[0].g_timebase.num = 1;
+ cfg[0].g_timebase.den = framerate;
+
+ // Other-resolution encoder settings
+ for (i = 1; i < kNumEncoders; ++i) {
+ cfg[i] = cfg[0];
+ cfg[i].g_threads = 1;
+ cfg[i].rc_target_bitrate = target_bitrate[i];
+
+ // Note: Width & height of other-resolution encoders are calculated
+ // from the highest-resolution encoder's size and the corresponding
+ // down_sampling_factor.
{
- memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t));
-
- cfg[i].g_threads = 1; /* number of threads used */
- cfg[i].rc_target_bitrate = target_bitrate[i];
-
- /* Note: Width & height of other-resolution encoders are calculated
- * from the highest-resolution encoder's size and the corresponding
- * down_sampling_factor.
- */
- {
- unsigned int iw = cfg[i-1].g_w*dsf[i-1].den + dsf[i-1].num - 1;
- unsigned int ih = cfg[i-1].g_h*dsf[i-1].den + dsf[i-1].num - 1;
- cfg[i].g_w = iw/dsf[i-1].num;
- cfg[i].g_h = ih/dsf[i-1].num;
- }
-
- /* Make width & height to be multiplier of 2. */
- // Should support odd size ???
- if((cfg[i].g_w)%2)cfg[i].g_w++;
- if((cfg[i].g_h)%2)cfg[i].g_h++;
+ unsigned int iw = cfg[i - 1].g_w * dsf[i - 1].den + dsf[i - 1].num - 1;
+ unsigned int ih = cfg[i - 1].g_h * dsf[i - 1].den + dsf[i - 1].num - 1;
+ cfg[i].g_w = iw / dsf[i - 1].num;
+ cfg[i].g_h = ih / dsf[i - 1].num;
}
- /* Allocate image for each encoder */
- for (i=0; i< NUM_ENCODERS; i++)
- if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
- die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
-
- if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w)
- read_frame_p = read_frame;
- else
- read_frame_p = read_frame_by_row;
-
- for (i=0; i< NUM_ENCODERS; i++)
- if(outfile[i])
- write_ivf_file_header(outfile[i], &cfg[i], 0);
-
- /* Initialize multi-encoder */
- if(vpx_codec_enc_init_multi(&codec[0], interface, &cfg[0], NUM_ENCODERS,
- (show_psnr ? VPX_CODEC_USE_PSNR : 0), &dsf[0]))
- die_codec(&codec[0], "Failed to initialize encoder");
-
- /* The extra encoding configuration parameters can be set as follows. */
- /* Set encoding speed */
- for ( i=0; i<NUM_ENCODERS; i++)
- {
- int speed = -6;
- if(vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed))
- die_codec(&codec[i], "Failed to set cpu_used");
+      // Make width & height multiples of 2.
+ if ((cfg[i].g_w) % 2)
+ cfg[i].g_w++;
+
+ if ((cfg[i].g_h) % 2)
+ cfg[i].g_h++;
+ }
+
+ // Open output file for each encoder to output bitstreams
+ for (i = 0; i < kNumEncoders; ++i) {
+ VpxVideoInfo info = {
+ encoder->fourcc,
+ cfg[i].g_w,
+ cfg[i].g_h,
+ {cfg[i].g_timebase.num, cfg[i].g_timebase.den}
+ };
+
+    writers[i] = vpx_video_writer_open(argv[i + 4], kContainerIVF, &info);
+    if (!writers[i])
+      die("Failed to open %s for writing", argv[i + 4]);
+ }
+
+ // Allocate image for each encoder
+ for (i = 0; i < kNumEncoders; ++i)
+ if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
+ die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
+
+ // Initialize multi-encoder
+ if (vpx_codec_enc_init_multi(&codec[0], encoder->codec_interface(), &cfg[0],
+ kNumEncoders,
+ show_psnr ? VPX_CODEC_USE_PSNR : 0, &dsf[0]))
+ die_codec(&codec[0], "Failed to initialize encoder");
+
+ // The extra encoding configuration parameters can be set as follows.
+ for (i = 0; i < kNumEncoders; i++) {
+ // Set encoding speed
+ if (vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, -6))
+ die_codec(&codec[i], "Failed to set cpu_used");
+
+ // Set static threshold.
+ if (vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, 1))
+ die_codec(&codec[i], "Failed to set static threshold");
+
+ // Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING
+ // Enable denoising for the highest-resolution encoder.
+ if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, i == 0))
+ die_codec(&codec[0], "Failed to set noise_sensitivity");
+ }
+
+ frame_avail = 1;
+ got_data = 0;
+
+ while (frame_avail || got_data) {
+ vpx_codec_iter_t iter[kNumEncoders] = {NULL};
+ const vpx_codec_cx_pkt_t *pkt[kNumEncoders];
+
+ frame_avail = vpx_img_read(&raw[0], infile);
+
+ if (frame_avail) {
+ for (i = 1; i < kNumEncoders; ++i) {
+ vpx_image_t *const prev = &raw[i - 1];
+
+        // Scale the image down a number of times by the down-sampling factor.
+        // FilterMode 1 or 2 gives better PSNR than FilterMode 0.
+ I420Scale(prev->planes[VPX_PLANE_Y], prev->stride[VPX_PLANE_Y],
+ prev->planes[VPX_PLANE_U], prev->stride[VPX_PLANE_U],
+ prev->planes[VPX_PLANE_V], prev->stride[VPX_PLANE_V],
+ prev->d_w, prev->d_h,
+ raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y],
+ raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U],
+ raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V],
+ raw[i].d_w, raw[i].d_h, 1);
+ }
}
- /* Set static threshold. */
- for ( i=0; i<NUM_ENCODERS; i++)
- {
- unsigned int static_thresh = 1;
- if(vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, static_thresh))
- die_codec(&codec[i], "Failed to set static threshold");
+ // Encode frame.
+    if (vpx_codec_encode(&codec[0], frame_avail ? &raw[0] : NULL,
+ frame_cnt, 1, 0, arg_deadline)) {
+ die_codec(&codec[0], "Failed to encode frame");
}
- /* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING */
- /* Enable denoising for the highest-resolution encoder. */
- if(vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 1))
- die_codec(&codec[0], "Failed to set noise_sensitivity");
- for ( i=1; i< NUM_ENCODERS; i++)
- {
- if(vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0))
- die_codec(&codec[i], "Failed to set noise_sensitivity");
- }
-
-
- frame_avail = 1;
- got_data = 0;
-
- while(frame_avail || got_data)
- {
- vpx_codec_iter_t iter[NUM_ENCODERS]={NULL};
- const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS];
-
- flags = 0;
- frame_avail = read_frame_p(infile, &raw[0]);
-
- if(frame_avail)
- {
- for ( i=1; i<NUM_ENCODERS; i++)
- {
- /*Scale the image down a number of times by downsampling factor*/
- /* FilterMode 1 or 2 give better psnr than FilterMode 0. */
- I420Scale(raw[i-1].planes[VPX_PLANE_Y], raw[i-1].stride[VPX_PLANE_Y],
- raw[i-1].planes[VPX_PLANE_U], raw[i-1].stride[VPX_PLANE_U],
- raw[i-1].planes[VPX_PLANE_V], raw[i-1].stride[VPX_PLANE_V],
- raw[i-1].d_w, raw[i-1].d_h,
- raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y],
- raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U],
- raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V],
- raw[i].d_w, raw[i].d_h, 1);
- }
- }
-
- /* Encode each frame at multi-levels */
- if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
- frame_cnt, 1, flags, arg_deadline))
- die_codec(&codec[0], "Failed to encode frame");
-
- for (i=NUM_ENCODERS-1; i>=0 ; i--)
- {
- got_data = 0;
-
- while( (pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i])) )
- {
- got_data = 1;
- switch(pkt[i]->kind) {
- case VPX_CODEC_CX_FRAME_PKT:
- write_ivf_frame_header(outfile[i], pkt[i]);
- (void) fwrite(pkt[i]->data.frame.buf, 1,
- pkt[i]->data.frame.sz, outfile[i]);
- break;
- case VPX_CODEC_PSNR_PKT:
- if (show_psnr)
- {
- int j;
-
- psnr_sse_total[i] += pkt[i]->data.psnr.sse[0];
- psnr_samples_total[i] += pkt[i]->data.psnr.samples[0];
- for (j = 0; j < 4; j++)
- {
- //fprintf(stderr, "%.3lf ", pkt[i]->data.psnr.psnr[j]);
- psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j];
- }
- psnr_count[i]++;
- }
-
- break;
- default:
- break;
- }
- printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT
- && (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? "K":".");
- fflush(stdout);
+ for (i = kNumEncoders - 1; i >= 0; i--) {
+ got_data = 0;
+
+ while ((pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i]))) {
+ got_data = 1;
+ switch (pkt[i]->kind) {
+ case VPX_CODEC_CX_FRAME_PKT:
+ vpx_video_writer_write_frame(writers[i], pkt[i]->data.frame.buf,
+ pkt[i]->data.frame.sz, frame_cnt - 1);
+ break;
+ case VPX_CODEC_PSNR_PKT:
+ if (show_psnr) {
+ int j;
+ psnr_sse_total[i] += pkt[i]->data.psnr.sse[0];
+ psnr_samples_total[i] += pkt[i]->data.psnr.samples[0];
+ for (j = 0; j < 4; j++)
+ psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j];
+ psnr_count[i]++;
}
+ break;
+ default:
+ break;
}
- frame_cnt++;
+ printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT &&
+ (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? "K":".");
+ fflush(stdout);
+ }
+ }
+ frame_cnt++;
+ }
+ printf("\n");
+
+ fclose(infile);
+
+ printf("Processed %d frames.\n", frame_cnt - 1);
+ for (i = 0; i < kNumEncoders; ++i) {
+ // Calculate PSNR and print it out
+ if (show_psnr && psnr_count[i] > 0) {
+ int j;
+ double ovpsnr = sse_to_psnr(psnr_samples_total[i], 255.0,
+ psnr_sse_total[i]);
+
+ fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i);
+ fprintf(stderr, " %.3lf", ovpsnr);
+ for (j = 0; j < 4; j++)
+ fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]);
}
- printf("\n");
-
- fclose(infile);
-
- printf("Processed %ld frames.\n",(long int)frame_cnt-1);
- for (i=0; i< NUM_ENCODERS; i++)
- {
- /* Calculate PSNR and print it out */
- if ( (show_psnr) && (psnr_count[i]>0) )
- {
- int j;
- double ovpsnr = sse_to_psnr(psnr_samples_total[i], 255.0,
- psnr_sse_total[i]);
-
- fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i);
-
- fprintf(stderr, " %.3lf", ovpsnr);
- for (j = 0; j < 4; j++)
- {
- fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]);
- }
- }
-
- if(vpx_codec_destroy(&codec[i]))
- die_codec(&codec[i], "Failed to destroy codec");
-
- vpx_img_free(&raw[i]);
- if(!outfile[i])
- continue;
+ if (vpx_codec_destroy(&codec[i]))
+ die_codec(&codec[i], "Failed to destroy codec");
- /* Try to rewrite the file header with the actual frame count */
- if(!fseek(outfile[i], 0, SEEK_SET))
- write_ivf_file_header(outfile[i], &cfg[i], frame_cnt-1);
- fclose(outfile[i]);
- }
- printf("\n");
+ vpx_img_free(&raw[i]);
+ vpx_video_writer_close(writers[i]);
+ }
+ printf("\n");
- return EXIT_SUCCESS;
+ return EXIT_SUCCESS;
}
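The per-level sizes in the loop above come from a ceiling division by the down-sampling factor, then rounding up to even. A standalone sketch of that arithmetic, assuming the example's 1280x720 input and dsf = {2/1, 2/1} (everything else here is illustrative):

    #include <stdio.h>

    int main(void) {
      unsigned int w = 1280, h = 720;  // hypothetical level-0 input
      const unsigned int num[2] = {2, 2}, den[2] = {1, 1};
      printf("level 0: %ux%u\n", w, h);
      for (int i = 0; i < 2; ++i) {
        w = (w * den[i] + num[i] - 1) / num[i];  // ceil(w * den / num)
        h = (h * den[i] + num[i] - 1) / num[i];
        if (w % 2) ++w;  // the encoder requires even dimensions
        if (h % 2) ++h;
        printf("level %d: %ux%u\n", i + 1, w, h);  // 640x360, then 320x180
      }
      return 0;
    }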
diff --git a/source/libvpx/examples/vp9_spatial_svc_encoder.c b/source/libvpx/examples/vp9_spatial_svc_encoder.c
index 223f37e..81d3800 100644
--- a/source/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/source/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -38,8 +38,10 @@ static const arg_def_t timebase_arg =
ARG_DEF("t", "timebase", 1, "timebase (num/den)");
static const arg_def_t bitrate_arg = ARG_DEF(
"b", "target-bitrate", 1, "encoding bitrate, in kilobits per second");
-static const arg_def_t layers_arg =
- ARG_DEF("l", "layers", 1, "number of SVC layers");
+static const arg_def_t spatial_layers_arg =
+ ARG_DEF("sl", "spatial-layers", 1, "number of spatial SVC layers");
+static const arg_def_t temporal_layers_arg =
+ ARG_DEF("tl", "temporal-layers", 1, "number of temporal SVC layers");
static const arg_def_t kf_dist_arg =
ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes");
static const arg_def_t scale_factors_arg =
@@ -65,10 +67,11 @@ static const arg_def_t max_bitrate_arg =
static const arg_def_t *svc_args[] = {
&frames_arg, &width_arg, &height_arg,
- &timebase_arg, &bitrate_arg, &skip_frames_arg, &layers_arg,
+ &timebase_arg, &bitrate_arg, &skip_frames_arg, &spatial_layers_arg,
&kf_dist_arg, &scale_factors_arg, &quantizers_arg, &passes_arg,
&pass_arg, &fpf_name_arg, &min_q_arg, &max_q_arg,
- &min_bitrate_arg, &max_bitrate_arg, NULL
+ &min_bitrate_arg, &max_bitrate_arg, &temporal_layers_arg,
+ NULL
};
static const uint32_t default_frames_to_skip = 0;
@@ -79,6 +82,7 @@ static const uint32_t default_timebase_num = 1;
static const uint32_t default_timebase_den = 60;
static const uint32_t default_bitrate = 1000;
static const uint32_t default_spatial_layers = 5;
+static const uint32_t default_temporal_layers = 1;
static const uint32_t default_kf_dist = 100;
typedef struct {
@@ -119,6 +123,7 @@ static void parse_command_line(int argc, const char **argv_,
// initialize SvcContext with parameters that will be passed to vpx_svc_init
svc_ctx->log_level = SVC_LOG_DEBUG;
svc_ctx->spatial_layers = default_spatial_layers;
+ svc_ctx->temporal_layers = default_temporal_layers;
// start with default encoder configuration
res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0);
@@ -156,8 +161,10 @@ static void parse_command_line(int argc, const char **argv_,
enc_cfg->rc_target_bitrate = arg_parse_uint(&arg);
} else if (arg_match(&arg, &skip_frames_arg, argi)) {
app_input->frames_to_skip = arg_parse_uint(&arg);
- } else if (arg_match(&arg, &layers_arg, argi)) {
+ } else if (arg_match(&arg, &spatial_layers_arg, argi)) {
svc_ctx->spatial_layers = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &temporal_layers_arg, argi)) {
+ svc_ctx->temporal_layers = arg_parse_uint(&arg);
} else if (arg_match(&arg, &kf_dist_arg, argi)) {
enc_cfg->kf_min_dist = arg_parse_uint(&arg);
enc_cfg->kf_max_dist = enc_cfg->kf_min_dist;
diff --git a/source/libvpx/examples/vpx_temporal_svc_encoder.c b/source/libvpx/examples/vpx_temporal_svc_encoder.c
index 4ec1848..5eac92c 100644
--- a/source/libvpx/examples/vpx_temporal_svc_encoder.c
+++ b/source/libvpx/examples/vpx_temporal_svc_encoder.c
@@ -12,6 +12,7 @@
// encoding scheme based on temporal scalability for video applications
// that benefit from a scalable bitstream.
+#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
@@ -438,7 +439,7 @@ static void set_temporal_layer_pattern(int layering_mode,
}
int main(int argc, char **argv) {
- VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS];
+ VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = {NULL};
vpx_codec_ctx_t codec;
vpx_codec_enc_cfg_t cfg;
int frame_cnt = 0;
@@ -456,7 +457,6 @@ int main(int argc, char **argv) {
int layering_mode = 0;
int layer_flags[VPX_TS_MAX_PERIODICITY] = {0};
int flag_periodicity = 1;
- int max_intra_size_pct;
vpx_svc_layer_id_t layer_id = {0, 0};
const VpxInterface *encoder = NULL;
FILE *infile = NULL;
@@ -570,6 +570,8 @@ int main(int argc, char **argv) {
outfile[i] = vpx_video_writer_open(file_name, kContainerIVF, &info);
if (!outfile[i])
die("Failed to open %s for writing", file_name);
+
+ assert(outfile[i] != NULL);
}
// No spatial layers in this encoder.
cfg.ss_number_layers = 1;
@@ -595,11 +597,11 @@ int main(int argc, char **argv) {
// This controls the maximum target size of the key frame.
// For generating smaller key frames, use a smaller max_intra_size_pct
// value, like 100 or 200.
- max_intra_size_pct = (int) (((double)cfg.rc_buf_optimal_sz * 0.5)
- * ((double) cfg.g_timebase.den / cfg.g_timebase.num) / 10.0);
- // For low-quality key frame.
- max_intra_size_pct = 200;
- vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct);
+ {
+ const int max_intra_size_pct = 200;
+ vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+ max_intra_size_pct);
+ }
frame_avail = 1;
while (frame_avail || got_data) {
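The deleted arithmetic was dead code: its result was overwritten by the constant before ever reaching the encoder. For reference, under hypothetical settings of rc_buf_optimal_sz = 600 and a 1/30 timebase, the discarded formula would have produced:

    // (600 * 0.5) * (30 / 1) / 10 = 900, then immediately replaced by 200.
    int max_intra_size_pct =
        (int)((600.0 * 0.5) * (30.0 / 1.0) / 10.0);  // 900, never used
    max_intra_size_pct = 200;  // the value the control actually received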
diff --git a/source/libvpx/libs.mk b/source/libvpx/libs.mk
index 25fbc2c..c7c2748 100644
--- a/source/libvpx/libs.mk
+++ b/source/libvpx/libs.mk
@@ -133,6 +133,8 @@ ifeq ($(CONFIG_VP9_DECODER),yes)
CODEC_DOC_SECTIONS += vp9 vp9_decoder
endif
+VP9_PREFIX=vp9/
+$(BUILD_PFX)$(VP9_PREFIX)%.c.o: CFLAGS += -Wextra
ifeq ($(CONFIG_ENCODERS),yes)
CODEC_DOC_SECTIONS += encoder
diff --git a/source/libvpx/test/active_map_test.cc b/source/libvpx/test/active_map_test.cc
index a9bb540..0221995 100644
--- a/source/libvpx/test/active_map_test.cc
+++ b/source/libvpx/test/active_map_test.cc
@@ -38,7 +38,7 @@ class ActiveMapTest
if (video->frame() == 1) {
encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
} else if (video->frame() == 3) {
- vpx_active_map_t map = {0};
+ vpx_active_map_t map = vpx_active_map_t();
uint8_t active_map[9 * 13] = {
1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
@@ -57,7 +57,7 @@ class ActiveMapTest
map.active_map = active_map;
encoder->Control(VP8E_SET_ACTIVEMAP, &map);
} else if (video->frame() == 15) {
- vpx_active_map_t map = {0};
+ vpx_active_map_t map = vpx_active_map_t();
map.cols = (kWidth + 15) / 16;
map.rows = (kHeight + 15) / 16;
map.active_map = NULL;
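Both initializer forms zero the entire aggregate; the rewrite only avoids GCC's missing-field-initializers warning (part of -Wextra) on the brace form. A minimal illustration with a hypothetical stand-in struct:

    struct Map { unsigned char *active_map; unsigned int rows, cols; };

    Map a = {0};    // first member from 0, the rest zeroed; warns under -Wextra
    Map b = Map();  // value-initialized: all members zeroed, no warning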
diff --git a/source/libvpx/test/datarate_test.cc b/source/libvpx/test/datarate_test.cc
index 8dcf26c..a3d730a 100644
--- a/source/libvpx/test/datarate_test.cc
+++ b/source/libvpx/test/datarate_test.cc
@@ -42,6 +42,9 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest,
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
+ }
const vpx_rational_t tb = video->timebase();
timebase_ = static_cast<double>(tb.num) / tb.den;
duration_ = 0;
@@ -120,9 +123,40 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest,
double file_datarate_;
double effective_datarate_;
size_t bits_in_last_frame_;
+ int denoiser_on_;
};
+// Check basic datarate targeting, for a single bitrate, but loop over the
+// various denoiser settings.
+TEST_P(DatarateTestLarge, DenoiserLevels) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_dropframe_thresh = 1;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = VPX_CBR;
+ ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 140);
+ for (int j = 1; j < 5; ++j) {
+ // Run over the denoiser levels.
+ // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j
+ // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV,
+ // denoiserOnAggressive, and denoiserOnAdaptive.
+ // For the spatial denoiser (if !CONFIG_TEMPORAL_DENOISING), the level j
+  // refers to the blur thresholds: 20, 40, 60, 80.
+ // The j = 0 case (denoiser off) is covered in the tests below.
+ denoiser_on_ = j;
+ cfg_.rc_target_bitrate = 300;
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+ << " The datarate for the file exceeds the target!";
+
+ ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
+ << " The datarate for the file missed the target!";
+ }
+}
+
TEST_P(DatarateTestLarge, BasicBufferModel) {
+ denoiser_on_ = 0;
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_dropframe_thresh = 1;
cfg_.rc_max_quantizer = 56;
@@ -154,6 +188,7 @@ TEST_P(DatarateTestLarge, BasicBufferModel) {
}
TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
+ denoiser_on_ = 0;
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_max_quantizer = 36;
cfg_.rc_end_usage = VPX_CBR;
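Assuming the temporal denoiser is compiled in, the levels j = 0..4 passed to VP8E_SET_NOISE_SENSITIVITY in the new test map onto modes along these lines (a sketch mirroring the comment above; the enumerator names are illustrative):

    enum DenoiserLevel {
      kDenoiserOff = 0,           // covered by the other datarate tests
      kDenoiserYOnly = 1,         // denoise the luma plane only
      kDenoiserOnYUV = 2,         // denoise all planes
      kDenoiserOnAggressive = 3,  // stronger filtering
      kDenoiserOnAdaptive = 4     // adapt strength to the noise estimate
    };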
diff --git a/source/libvpx/test/dct16x16_test.cc b/source/libvpx/test/dct16x16_test.cc
index ee417ce..c38cc2e 100644
--- a/source/libvpx/test/dct16x16_test.cc
+++ b/source/libvpx/test/dct16x16_test.cc
@@ -268,11 +268,13 @@ typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,
typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct16x16Param;
typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht16x16Param;
-void fdct16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fdct16x16_ref(const int16_t *in, int16_t *out, int stride,
+ int /*tx_type*/) {
vp9_fdct16x16_c(in, out, stride);
}
-void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) {
+void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride,
+ int /*tx_type*/) {
vp9_idct16x16_256_add_c(in, dest, stride);
}
diff --git a/source/libvpx/test/dct32x32_test.cc b/source/libvpx/test/dct32x32_test.cc
index 4f34a44..d2d437c 100644
--- a/source/libvpx/test/dct32x32_test.cc
+++ b/source/libvpx/test/dct32x32_test.cc
@@ -37,7 +37,7 @@ static int round(double x) {
const int kNumCoeffs = 1024;
const double kPi = 3.141592653589793238462643383279502884;
-void reference_32x32_dct_1d(const double in[32], double out[32], int stride) {
+void reference_32x32_dct_1d(const double in[32], double out[32]) {
const double kInvSqrt2 = 0.707106781186547524400844362104;
for (int k = 0; k < 32; k++) {
out[k] = 0.0;
@@ -55,7 +55,7 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
double temp_in[32], temp_out[32];
for (int j = 0; j < 32; ++j)
temp_in[j] = input[j*32 + i];
- reference_32x32_dct_1d(temp_in, temp_out, 1);
+ reference_32x32_dct_1d(temp_in, temp_out);
for (int j = 0; j < 32; ++j)
output[j * 32 + i] = temp_out[j];
}
@@ -64,7 +64,7 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
double temp_in[32], temp_out[32];
for (int j = 0; j < 32; ++j)
temp_in[j] = output[j + i*32];
- reference_32x32_dct_1d(temp_in, temp_out, 1);
+ reference_32x32_dct_1d(temp_in, temp_out);
// Scale by some magic number
for (int j = 0; j < 32; ++j)
output[j + i * 32] = temp_out[j] / 4;
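The dropped stride parameter was never used: the helper transforms one contiguous length-32 vector. Assuming the usual reference implementation that the visible k == 0 scaling suggests, the function computes the DCT-II:

    \mathrm{out}[k] = c_k \sum_{n=0}^{31} \mathrm{in}[n]\,
        \cos\frac{\pi (2n+1) k}{64},
    \qquad c_0 = 1/\sqrt{2},\ c_k = 1 \text{ for } k > 0.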
diff --git a/source/libvpx/test/decode_perf_test.cc b/source/libvpx/test/decode_perf_test.cc
index 11529b3..5a71140 100644
--- a/source/libvpx/test/decode_perf_test.cc
+++ b/source/libvpx/test/decode_perf_test.cc
@@ -74,7 +74,7 @@ TEST_P(DecodePerfTest, PerfTest) {
libvpx_test::WebMVideoSource video(video_name);
video.Init();
- vpx_codec_dec_cfg_t cfg = {0};
+ vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
cfg.threads = threads;
libvpx_test::VP9Decoder decoder(cfg, 0);
diff --git a/source/libvpx/test/decode_test_driver.cc b/source/libvpx/test/decode_test_driver.cc
index 99610eb..0ef4f7b 100644
--- a/source/libvpx/test/decode_test_driver.cc
+++ b/source/libvpx/test/decode_test_driver.cc
@@ -106,7 +106,7 @@ void DecoderTest::RunLoop(CompressedVideoSource *video,
}
void DecoderTest::RunLoop(CompressedVideoSource *video) {
- vpx_codec_dec_cfg_t dec_cfg = {0};
+ vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
RunLoop(video, dec_cfg);
}
diff --git a/source/libvpx/test/decode_test_driver.h b/source/libvpx/test/decode_test_driver.h
index 1f73c7d..a757b59 100644
--- a/source/libvpx/test/decode_test_driver.h
+++ b/source/libvpx/test/decode_test_driver.h
@@ -125,20 +125,20 @@ class DecoderTest {
const vpx_codec_dec_cfg_t &dec_cfg);
// Hook to be called before decompressing every frame.
- virtual void PreDecodeFrameHook(const CompressedVideoSource& video,
- Decoder *decoder) {}
+ virtual void PreDecodeFrameHook(const CompressedVideoSource& /*video*/,
+ Decoder* /*decoder*/) {}
// Hook to be called to handle decode result. Return true to continue.
virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
- const CompressedVideoSource& /* video */,
+ const CompressedVideoSource& /*video*/,
Decoder *decoder) {
EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
return VPX_CODEC_OK == res_dec;
}
// Hook to be called on every decompressed frame.
- virtual void DecompressedFrameHook(const vpx_image_t& img,
- const unsigned int frame_number) {}
+ virtual void DecompressedFrameHook(const vpx_image_t& /*img*/,
+ const unsigned int /*frame_number*/) {}
// Hook to be called on peek result
virtual void HandlePeekResult(Decoder* const decoder,
diff --git a/source/libvpx/test/encode_test_driver.cc b/source/libvpx/test/encode_test_driver.cc
index 6d4281d..9702ddf 100644
--- a/source/libvpx/test/encode_test_driver.cc
+++ b/source/libvpx/test/encode_test_driver.cc
@@ -133,13 +133,13 @@ static bool compare_img(const vpx_image_t *img1,
return match;
}
-void EncoderTest::MismatchHook(const vpx_image_t *img1,
- const vpx_image_t *img2) {
+void EncoderTest::MismatchHook(const vpx_image_t* /*img1*/,
+ const vpx_image_t* /*img2*/) {
ASSERT_TRUE(0) << "Encode/Decode mismatch found";
}
void EncoderTest::RunLoop(VideoSource *video) {
- vpx_codec_dec_cfg_t dec_cfg = {0};
+ vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
stats_.Reset();
diff --git a/source/libvpx/test/encode_test_driver.h b/source/libvpx/test/encode_test_driver.h
index 2270ce2..a77bd64 100644
--- a/source/libvpx/test/encode_test_driver.h
+++ b/source/libvpx/test/encode_test_driver.h
@@ -189,20 +189,21 @@ class EncoderTest {
virtual void RunLoop(VideoSource *video);
// Hook to be called at the beginning of a pass.
- virtual void BeginPassHook(unsigned int pass) {}
+ virtual void BeginPassHook(unsigned int /*pass*/) {}
// Hook to be called at the end of a pass.
virtual void EndPassHook() {}
// Hook to be called before encoding a frame.
- virtual void PreEncodeFrameHook(VideoSource *video) {}
- virtual void PreEncodeFrameHook(VideoSource *video, Encoder *encoder) {}
+ virtual void PreEncodeFrameHook(VideoSource* /*video*/) {}
+ virtual void PreEncodeFrameHook(VideoSource* /*video*/,
+ Encoder* /*encoder*/) {}
// Hook to be called on every compressed data packet.
- virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {}
+ virtual void FramePktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {}
// Hook to be called on every PSNR packet.
- virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {}
+ virtual void PSNRPktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {}
// Hook to determine whether the encode loop should continue.
virtual bool Continue() const {
@@ -218,19 +219,19 @@ class EncoderTest {
const vpx_image_t *img2);
// Hook to be called on every decompressed frame.
- virtual void DecompressedFrameHook(const vpx_image_t& img,
- vpx_codec_pts_t pts) {}
+ virtual void DecompressedFrameHook(const vpx_image_t& /*img*/,
+ vpx_codec_pts_t /*pts*/) {}
// Hook to be called to handle decode result. Return true to continue.
virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
- const VideoSource& /* video */,
+ const VideoSource& /*video*/,
Decoder *decoder) {
EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
return VPX_CODEC_OK == res_dec;
}
// Hook that can modify the encoder's output data
- virtual const vpx_codec_cx_pkt_t * MutateEncoderOutputHook(
+ virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
const vpx_codec_cx_pkt_t *pkt) {
return pkt;
}
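Commenting out a parameter name is the idiomatic way to silence -Wunused-parameter while keeping the name visible as documentation and leaving override signatures unchanged. A minimal sketch with a hypothetical hook class:

    struct FrameHook {
      virtual ~FrameHook() {}
      // Unused in the default implementation, so only the types carry names.
      virtual void OnFrame(const vpx_image_t& /*img*/,
                           unsigned int /*frame_number*/) {}
    };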
diff --git a/source/libvpx/test/external_frame_buffer_test.cc b/source/libvpx/test/external_frame_buffer_test.cc
index fb0449d..44eba33 100644
--- a/source/libvpx/test/external_frame_buffer_test.cc
+++ b/source/libvpx/test/external_frame_buffer_test.cc
@@ -285,7 +285,7 @@ class ExternalFrameBufferTest : public ::testing::Test {
video_->Init();
video_->Begin();
- vpx_codec_dec_cfg_t cfg = {0};
+ vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
ASSERT_TRUE(decoder_ != NULL);
}
diff --git a/source/libvpx/test/fdct4x4_test.cc b/source/libvpx/test/fdct4x4_test.cc
index 7c48260..08a69ab 100644
--- a/source/libvpx/test/fdct4x4_test.cc
+++ b/source/libvpx/test/fdct4x4_test.cc
@@ -40,7 +40,7 @@ typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,
typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct4x4Param;
typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht4x4Param;
-void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) {
vp9_fdct4x4_c(in, out, stride);
}
@@ -48,7 +48,7 @@ void fht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
vp9_fht4x4_c(in, out, stride, tx_type);
}
-void fwht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fwht4x4_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) {
vp9_fwht4x4_c(in, out, stride);
}
diff --git a/source/libvpx/test/fdct8x8_test.cc b/source/libvpx/test/fdct8x8_test.cc
index 567e5f6..a694f0c 100644
--- a/source/libvpx/test/fdct8x8_test.cc
+++ b/source/libvpx/test/fdct8x8_test.cc
@@ -39,7 +39,7 @@ typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,
typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct8x8Param;
typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht8x8Param;
-void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) {
vp9_fdct8x8_c(in, out, stride);
}
diff --git a/source/libvpx/test/frame_size_tests.cc b/source/libvpx/test/frame_size_tests.cc
index db27975..1c9a522 100644
--- a/source/libvpx/test/frame_size_tests.cc
+++ b/source/libvpx/test/frame_size_tests.cc
@@ -27,7 +27,7 @@ class VP9FrameSizeTestsLarge
}
virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
- const libvpx_test::VideoSource &video,
+ const libvpx_test::VideoSource& /*video*/,
libvpx_test::Decoder *decoder) {
EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
return !::testing::Test::HasFailure();
diff --git a/source/libvpx/test/intrapred_test.cc b/source/libvpx/test/intrapred_test.cc
index ead4760..f0d9c34 100644
--- a/source/libvpx/test/intrapred_test.cc
+++ b/source/libvpx/test/intrapred_test.cc
@@ -294,6 +294,11 @@ INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredYTest,
::testing::Values(
vp8_build_intra_predictors_mby_s_ssse3));
#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, IntraPredYTest,
+ ::testing::Values(
+ vp8_build_intra_predictors_mby_s_neon));
+#endif
typedef void (*IntraPredUvFunc)(MACROBLOCKD *x,
uint8_t *uabove_row,
@@ -382,5 +387,10 @@ INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredUVTest,
::testing::Values(
vp8_build_intra_predictors_mbuv_s_ssse3));
#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, IntraPredUVTest,
+ ::testing::Values(
+ vp8_build_intra_predictors_mbuv_s_neon));
+#endif
} // namespace
diff --git a/source/libvpx/test/invalid_file_test.cc b/source/libvpx/test/invalid_file_test.cc
index 0a1c17c..50e7c23 100644
--- a/source/libvpx/test/invalid_file_test.cc
+++ b/source/libvpx/test/invalid_file_test.cc
@@ -73,7 +73,7 @@ class InvalidFileTest
void RunTest() {
const DecodeParam input = GET_PARAM(1);
libvpx_test::CompressedVideoSource *video = NULL;
- vpx_codec_dec_cfg_t cfg = {0};
+ vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
cfg.threads = input.threads;
const std::string filename = input.filename;
@@ -113,9 +113,12 @@ TEST_P(InvalidFileTest, ReturnCode) {
const DecodeParam kVP9InvalidFileTests[] = {
{1, "invalid-vp90-02-v2.webm"},
{1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"},
- {1, "invalid-vp90-03-v2.webm"},
+ {1, "invalid-vp90-03-v3.webm"},
{1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf"},
{1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf"},
+ {1, "invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf"},
+ {1, "invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf"},
+ {1, "invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf"},
};
VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
@@ -126,9 +129,9 @@ VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
class InvalidFileInvalidPeekTest : public InvalidFileTest {
protected:
InvalidFileInvalidPeekTest() : InvalidFileTest() {}
- virtual void HandlePeekResult(libvpx_test::Decoder *const decoder,
- libvpx_test::CompressedVideoSource *video,
- const vpx_codec_err_t res_peek) {}
+ virtual void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/,
+ libvpx_test::CompressedVideoSource* /*video*/,
+ const vpx_codec_err_t /*res_peek*/) {}
};
TEST_P(InvalidFileInvalidPeekTest, ReturnCode) {
@@ -144,6 +147,10 @@ VP9_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest,
const DecodeParam kMultiThreadedVP9InvalidFileTests[] = {
{4, "invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm"},
+ {4, "invalid-"
+ "vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf"},
+ {2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf"},
+ {4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf"},
};
INSTANTIATE_TEST_CASE_P(
diff --git a/source/libvpx/test/md5_helper.h b/source/libvpx/test/md5_helper.h
index dc95582..1db712b 100644
--- a/source/libvpx/test/md5_helper.h
+++ b/source/libvpx/test/md5_helper.h
@@ -28,7 +28,8 @@ class MD5 {
// plane, we never want to round down and thus skip a pixel so if
// we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
// This works only for chroma_shift of 0 and 1.
- const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
+ const int bytes_per_sample =
+ (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
const int h = plane ? (img->d_h + img->y_chroma_shift) >>
img->y_chroma_shift : img->d_h;
const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
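The rounding described in the comment, worked through in isolation: for odd luma dimensions a plain shift would drop the last chroma sample, so the shift amount is added first (hypothetical numbers, not code from the file):

    const int d_w = 99;  // luma width of a 4:2:0 image
    const int x_chroma_shift = 1;
    const int w = (d_w + x_chroma_shift) >> x_chroma_shift;  // 50
    // d_w >> x_chroma_shift would give 49 and skip the last column.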
diff --git a/source/libvpx/test/resize_test.cc b/source/libvpx/test/resize_test.cc
index 8d08f1e..9d0c570 100644
--- a/source/libvpx/test/resize_test.cc
+++ b/source/libvpx/test/resize_test.cc
@@ -211,8 +211,8 @@ class ResizeInternalTest : public ResizeTest {
EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
}
- virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
#if WRITE_COMPRESSED_STREAM
+ virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
++out_frames_;
// Write initial file header if first frame.
@@ -222,8 +222,8 @@ class ResizeInternalTest : public ResizeTest {
// Write frame header and data.
write_ivf_frame_header(pkt, outfile_);
(void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
-#endif
}
+#endif
double frame0_psnr_;
#if WRITE_COMPRESSED_STREAM
diff --git a/source/libvpx/test/sad_test.cc b/source/libvpx/test/sad_test.cc
index e63770b..5377c1e 100644
--- a/source/libvpx/test/sad_test.cc
+++ b/source/libvpx/test/sad_test.cc
@@ -505,21 +505,6 @@ const SadMxNParam mmx_tests[] = {
INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
#endif // CONFIG_VP8_ENCODER
-#if CONFIG_VP9_ENCODER
-const SadMxNVp9Func sad_16x16_mmx_vp9 = vp9_sad16x16_mmx;
-const SadMxNVp9Func sad_8x16_mmx_vp9 = vp9_sad8x16_mmx;
-const SadMxNVp9Func sad_16x8_mmx_vp9 = vp9_sad16x8_mmx;
-const SadMxNVp9Func sad_8x8_mmx_vp9 = vp9_sad8x8_mmx;
-const SadMxNVp9Func sad_4x4_mmx_vp9 = vp9_sad4x4_mmx;
-const SadMxNVp9Param mmx_vp9_tests[] = {
- make_tuple(16, 16, sad_16x16_mmx_vp9),
- make_tuple(8, 16, sad_8x16_mmx_vp9),
- make_tuple(16, 8, sad_16x8_mmx_vp9),
- make_tuple(8, 8, sad_8x8_mmx_vp9),
- make_tuple(4, 4, sad_4x4_mmx_vp9),
-};
-INSTANTIATE_TEST_CASE_P(MMX, SADVP9Test, ::testing::ValuesIn(mmx_vp9_tests));
-#endif // CONFIG_VP9_ENCODER
#endif // HAVE_MMX
#if HAVE_SSE
diff --git a/source/libvpx/test/svc_test.cc b/source/libvpx/test/svc_test.cc
index 1cb01a4..218f53d 100644
--- a/source/libvpx/test/svc_test.cc
+++ b/source/libvpx/test/svc_test.cc
@@ -60,7 +60,7 @@ class SvcTest : public ::testing::Test {
codec_enc_.kf_min_dist = 100;
codec_enc_.kf_max_dist = 100;
- vpx_codec_dec_cfg_t dec_cfg = {0};
+ vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
VP9CodecFactory codec_factory;
decoder_ = codec_factory.CreateDecoder(dec_cfg, 0);
}
@@ -112,7 +112,7 @@ class SvcTest : public ::testing::Test {
video.Next();
}
- // Flush encoder and test EOS packet
+ // Flush encoder and test EOS packet.
res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
@@ -135,7 +135,7 @@ class SvcTest : public ::testing::Test {
EXPECT_EQ(1, vpx_svc_is_keyframe(&svc_));
}
- outputs[*frame_received].buf = malloc(frame_size);
+ outputs[*frame_received].buf = malloc(frame_size + 16);
ASSERT_TRUE(outputs[*frame_received].buf != NULL);
memcpy(outputs[*frame_received].buf, vpx_svc_get_buffer(&svc_),
frame_size);
@@ -176,13 +176,13 @@ class SvcTest : public ::testing::Test {
video.Next();
}
- // Flush Encoder
+ // Flush encoder.
res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
video.duration(), VPX_DL_GOOD_QUALITY);
EXPECT_EQ(VPX_CODEC_OK, res);
StoreFrames(n, outputs, &frame_received);
- EXPECT_EQ(frame_received, (size_t)n);
+ EXPECT_EQ(frame_received, static_cast<size_t>(n));
ReleaseEncoder();
}
@@ -204,7 +204,7 @@ class SvcTest : public ::testing::Test {
++decoded_frames;
DxDataIterator dec_iter = decoder_->GetDxData();
- while (dec_iter.Next()) {
+ while (dec_iter.Next() != NULL) {
++received_frames;
}
}
@@ -212,12 +212,13 @@ class SvcTest : public ::testing::Test {
EXPECT_EQ(received_frames, n);
}
- void DropEnhancementLayers(struct vpx_fixed_buf *const inputs,
- const int num_super_frames,
- const int remained_layers) {
+ void DropLayersAndMakeItVP9Comaptible(struct vpx_fixed_buf *const inputs,
+ const int num_super_frames,
+ const int remained_spatial_layers,
+ const bool is_multiple_frame_contexts) {
ASSERT_TRUE(inputs != NULL);
ASSERT_GT(num_super_frames, 0);
- ASSERT_GT(remained_layers, 0);
+ ASSERT_GT(remained_spatial_layers, 0);
for (int i = 0; i < num_super_frames; ++i) {
uint32_t frame_sizes[8] = {0};
@@ -233,34 +234,112 @@ class SvcTest : public ::testing::Test {
NULL, NULL);
ASSERT_EQ(VPX_CODEC_OK, res);
- uint8_t *frame_data = static_cast<uint8_t *>(inputs[i].buf);
- uint8_t *frame_start = frame_data;
- for (frame = 0; frame < frame_count; ++frame) {
- // Looking for a visible frame
- if (frame_data[0] & 0x02) {
- ++frames_found;
- if (frames_found == remained_layers)
- break;
+ if (frame_count == 0) {
+ // There's no super frame but only a single frame.
+ ASSERT_EQ(1, remained_spatial_layers);
+ if (is_multiple_frame_contexts) {
+ // Make a new super frame.
+ uint8_t marker = 0xc1;
+ unsigned int mask;
+ int mag;
+
+ // Choose the magnitude.
+ for (mag = 0, mask = 0xff; mag < 4; ++mag) {
+ if (inputs[i].sz < mask)
+ break;
+ mask <<= 8;
+ mask |= 0xff;
+ }
+ marker |= mag << 3;
+ int index_sz = 2 + (mag + 1) * 2;
+
+ inputs[i].buf = realloc(inputs[i].buf, inputs[i].sz + index_sz + 16);
+ ASSERT_TRUE(inputs[i].buf != NULL);
+ uint8_t *frame_data = static_cast<uint8_t*>(inputs[i].buf);
+ frame_data[0] &= ~2; // Set the show_frame flag to 0.
+ frame_data += inputs[i].sz;
+          // Add a one-byte frame with show_existing_frame.
+ *frame_data++ = 0x88;
+
+ // Write the super frame index.
+ *frame_data++ = marker;
+
+ frame_sizes[0] = inputs[i].sz;
+ frame_sizes[1] = 1;
+ for (int j = 0; j < 2; ++j) {
+ unsigned int this_sz = frame_sizes[j];
+ for (int k = 0; k <= mag; k++) {
+ *frame_data++ = this_sz & 0xff;
+ this_sz >>= 8;
+ }
+ }
+ *frame_data++ = marker;
+ inputs[i].sz += index_sz + 1;
}
+ } else {
+ // Found a super frame.
+ uint8_t *frame_data = static_cast<uint8_t*>(inputs[i].buf);
+ uint8_t *frame_start = frame_data;
+ for (frame = 0; frame < frame_count; ++frame) {
+ // Looking for a visible frame.
+ if (frame_data[0] & 0x02) {
+ ++frames_found;
+ if (frames_found == remained_spatial_layers)
+ break;
+ }
+ frame_data += frame_sizes[frame];
+ }
+ ASSERT_LT(frame, frame_count) << "Couldn't find a visible frame. "
+ << "remained_spatial_layers: " << remained_spatial_layers
+ << " super_frame: " << i
+ << " is_multiple_frame_context: " << is_multiple_frame_contexts;
+ if (frame == frame_count - 1 && !is_multiple_frame_contexts)
+ continue;
+
frame_data += frame_sizes[frame];
+
+ // We need to add one more frame for multiple frame contexts.
+ if (is_multiple_frame_contexts)
+ ++frame;
+ uint8_t marker =
+ static_cast<const uint8_t*>(inputs[i].buf)[inputs[i].sz - 1];
+ const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+ const size_t index_sz = 2 + mag * frame_count;
+ const size_t new_index_sz = 2 + mag * (frame + 1);
+ marker &= 0x0f8;
+ marker |= frame;
+
+ // Copy existing frame sizes.
+ memmove(frame_data + (is_multiple_frame_contexts ? 2 : 1),
+ frame_start + inputs[i].sz - index_sz + 1, new_index_sz - 2);
+ if (is_multiple_frame_contexts) {
+        // Add a one-byte frame with the show_existing_frame flag.
+ *frame_data++ = 0x88 | (remained_spatial_layers - 1);
+ }
+ // New marker.
+ frame_data[0] = marker;
+ frame_data += (mag * (frame + 1) + 1);
+
+ if (is_multiple_frame_contexts) {
+          // Write the frame size for the one-byte frame.
+ frame_data -= mag;
+ *frame_data++ = 1;
+ for (uint32_t j = 1; j < mag; ++j) {
+ *frame_data++ = 0;
+ }
+ }
+
+ *frame_data++ = marker;
+ inputs[i].sz = frame_data - frame_start;
+
+ if (is_multiple_frame_contexts) {
+ // Change the show frame flag to 0 for all frames.
+ for (int j = 0; j < frame; ++j) {
+ frame_start[0] &= ~2;
+ frame_start += frame_sizes[j];
+ }
+ }
}
- ASSERT_LT(frame, frame_count);
- if (frame == frame_count - 1)
- continue;
-
- frame_data += frame_sizes[frame];
- uint8_t marker =
- static_cast<const uint8_t *>(inputs[i].buf)[inputs[i].sz - 1];
- const uint32_t mag = ((marker >> 3) & 0x3) + 1;
- const size_t index_sz = 2 + mag * frame_count;
- const size_t new_index_sz = 2 + mag * (frame + 1);
- marker &= 0x0f8;
- marker |= frame;
- frame_data[0] = marker;
- memcpy(frame_data + 1, frame_start + inputs[i].sz - index_sz + 1,
- new_index_sz - 2);
- frame_data[new_index_sz - 1] = marker;
- inputs[i].sz = frame_data - frame_start + new_index_sz;
}
}
@@ -326,7 +405,7 @@ TEST_F(SvcTest, InvalidOptions) {
}
TEST_F(SvcTest, SetLayersOption) {
- vpx_codec_err_t res = vpx_svc_set_options(&svc_, "layers=3");
+ vpx_codec_err_t res = vpx_svc_set_options(&svc_, "spatial-layers=3");
EXPECT_EQ(VPX_CODEC_OK, res);
InitializeEncoder();
EXPECT_EQ(3, svc_.spatial_layers);
@@ -334,7 +413,7 @@ TEST_F(SvcTest, SetLayersOption) {
TEST_F(SvcTest, SetMultipleOptions) {
vpx_codec_err_t res =
- vpx_svc_set_options(&svc_, "layers=2 scale-factors=1/3,2/3");
+ vpx_svc_set_options(&svc_, "spatial-layers=2 scale-factors=1/3,2/3");
EXPECT_EQ(VPX_CODEC_OK, res);
InitializeEncoder();
EXPECT_EQ(2, svc_.spatial_layers);
@@ -496,7 +575,7 @@ TEST_F(SvcTest, TwoPassEncode20FramesWithAltRef) {
FreeBitstreamBuffers(&outputs[0], 20);
}
-TEST_F(SvcTest, TwoPassEncode2LayersDecodeBaseLayerOnly) {
+TEST_F(SvcTest, TwoPassEncode2SpatialLayersDecodeBaseLayerOnly) {
// First pass encode
std::string stats_buf;
Pass1EncodeNFrames(10, 2, &stats_buf);
@@ -507,12 +586,12 @@ TEST_F(SvcTest, TwoPassEncode2LayersDecodeBaseLayerOnly) {
vpx_fixed_buf outputs[10];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
- DropEnhancementLayers(&outputs[0], 10, 1);
+ DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, false);
DecodeNFrames(&outputs[0], 10);
FreeBitstreamBuffers(&outputs[0], 10);
}
-TEST_F(SvcTest, TwoPassEncode5LayersDecode54321Layers) {
+TEST_F(SvcTest, TwoPassEncode5SpatialLayersDecode54321Layers) {
// First pass encode
std::string stats_buf;
Pass1EncodeNFrames(10, 5, &stats_buf);
@@ -525,13 +604,13 @@ TEST_F(SvcTest, TwoPassEncode5LayersDecode54321Layers) {
Pass2EncodeNFrames(&stats_buf, 10, 5, &outputs[0]);
DecodeNFrames(&outputs[0], 10);
- DropEnhancementLayers(&outputs[0], 10, 4);
+ DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 4, false);
DecodeNFrames(&outputs[0], 10);
- DropEnhancementLayers(&outputs[0], 10, 3);
+ DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 3, false);
DecodeNFrames(&outputs[0], 10);
- DropEnhancementLayers(&outputs[0], 10, 2);
+ DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 2, false);
DecodeNFrames(&outputs[0], 10);
- DropEnhancementLayers(&outputs[0], 10, 1);
+ DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, false);
DecodeNFrames(&outputs[0], 10);
FreeBitstreamBuffers(&outputs[0], 10);
@@ -568,12 +647,212 @@ TEST_F(SvcTest, TwoPassEncode3SNRLayersDecode321Layers) {
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 20, 3, &outputs[0]);
DecodeNFrames(&outputs[0], 20);
- DropEnhancementLayers(&outputs[0], 20, 2);
+ DropLayersAndMakeItVP9Comaptible(&outputs[0], 20, 2, false);
DecodeNFrames(&outputs[0], 20);
- DropEnhancementLayers(&outputs[0], 20, 1);
+ DropLayersAndMakeItVP9Comaptible(&outputs[0], 20, 1, false);
DecodeNFrames(&outputs[0], 20);
FreeBitstreamBuffers(&outputs[0], 20);
}
+TEST_F(SvcTest, SetMultipleFrameContextsOption) {
+ svc_.spatial_layers = 5;
+ vpx_codec_err_t res =
+ vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ svc_.spatial_layers = 2;
+ res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
+ InitializeEncoder();
+}
+
+TEST_F(SvcTest, TwoPassEncode2SpatialLayersWithMultipleFrameContexts) {
+ // First pass encode
+ std::string stats_buf;
+ Pass1EncodeNFrames(10, 2, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ codec_enc_.g_error_resilient = 0;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+ DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 2, true);
+ DecodeNFrames(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest,
+ TwoPassEncode2SpatialLayersWithMultipleFrameContextsDecodeBaselayer) {
+ // First pass encode
+ std::string stats_buf;
+ Pass1EncodeNFrames(10, 2, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ codec_enc_.g_error_resilient = 0;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+ DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, true);
+ DecodeNFrames(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2SNRLayersWithMultipleFrameContexts) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
+ Pass1EncodeNFrames(10, 2, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ codec_enc_.g_error_resilient = 0;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 scale-factors=1/1,1/1 "
+ "multi-frame-contexts=1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+ DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 2, true);
+ DecodeNFrames(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest,
+ TwoPassEncode3SNRLayersWithMultipleFrameContextsDecode321Layer) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
+ Pass1EncodeNFrames(10, 3, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ codec_enc_.g_error_resilient = 0;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1 "
+ "multi-frame-contexts=1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 3, &outputs[0]);
+
+ vpx_fixed_buf outputs_new[10];
+ for (int i = 0; i < 10; ++i) {
+ outputs_new[i].buf = malloc(outputs[i].sz + 16);
+ ASSERT_TRUE(outputs_new[i].buf != NULL);
+ memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz);
+ outputs_new[i].sz = outputs[i].sz;
+ }
+ DropLayersAndMakeItVP9Comaptible(&outputs_new[0], 10, 3, true);
+ DecodeNFrames(&outputs_new[0], 10);
+
+ for (int i = 0; i < 10; ++i) {
+ memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz);
+ outputs_new[i].sz = outputs[i].sz;
+ }
+ DropLayersAndMakeItVP9Comaptible(&outputs_new[0], 10, 2, true);
+ DecodeNFrames(&outputs_new[0], 10);
+
+ for (int i = 0; i < 10; ++i) {
+ memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz);
+ outputs_new[i].sz = outputs[i].sz;
+ }
+ DropLayersAndMakeItVP9Comaptible(&outputs_new[0], 10, 1, true);
+ DecodeNFrames(&outputs_new[0], 10);
+
+ FreeBitstreamBuffers(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs_new[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayers) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1");
+ svc_.temporal_layers = 2;
+ Pass1EncodeNFrames(10, 1, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ svc_.temporal_layers = 2;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+ DecodeNFrames(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContexts) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1");
+ svc_.temporal_layers = 2;
+ Pass1EncodeNFrames(10, 1, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ svc_.temporal_layers = 2;
+ codec_enc_.g_error_resilient = 0;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
+ "multi-frame-contexts=1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+ DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, true);
+ DecodeNFrames(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayersDecodeBaseLayer) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1");
+ svc_.temporal_layers = 2;
+ Pass1EncodeNFrames(10, 1, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ svc_.temporal_layers = 2;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+
+ vpx_fixed_buf base_layer[5];
+ for (int i = 0; i < 5; ++i)
+ base_layer[i] = outputs[i * 2];
+
+ DecodeNFrames(&base_layer[0], 5);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest,
+ TwoPassEncode2TemporalLayersWithMultipleFrameContextsDecodeBaseLayer) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1");
+ svc_.temporal_layers = 2;
+ Pass1EncodeNFrames(10, 1, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ svc_.temporal_layers = 2;
+ codec_enc_.g_error_resilient = 0;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
+ "multi-frame-contexts=1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+ DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, true);
+
+ vpx_fixed_buf base_layer[5];
+ for (int i = 0; i < 5; ++i)
+ base_layer[i] = outputs[i * 2];
+
+ DecodeNFrames(&base_layer[0], 5);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
} // namespace
diff --git a/source/libvpx/test/test-data.sha1 b/source/libvpx/test/test-data.sha1
index ee6289f..84b13f9 100644
--- a/source/libvpx/test/test-data.sha1
+++ b/source/libvpx/test/test-data.sha1
@@ -10,8 +10,8 @@ fe346136b9b8c1e6f6084cc106485706915795e4 invalid-vp90-01-v2.webm
25751f5d3b05ff03f0719ad42cd625348eb8961e invalid-vp90-01-v2.webm.res
d78e2fceba5ac942246503ec8366f879c4775ca5 invalid-vp90-02-v2.webm
8e2eff4af87d2b561cce2365713269e301457ef3 invalid-vp90-02-v2.webm.res
-df1a1453feb3c00d7d89746c7003b4163523bff3 invalid-vp90-03-v2.webm
-25dd58c22d23f75304d7ce7f69f4e5b02ef9119a invalid-vp90-03-v2.webm.res
+df1a1453feb3c00d7d89746c7003b4163523bff3 invalid-vp90-03-v3.webm
+4935c62becc68c13642a03db1e6d3e2331c1c612 invalid-vp90-03-v3.webm.res
d637297561dd904eb2c97a9015deeb31c4a1e8d2 invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
3a204bdbeaa3c6458b77bcebb8366d107267f55d invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
a432f96ff0a787268e2f94a8092ab161a18d1b06 park_joy_90p_10_420.y4m
@@ -681,3 +681,19 @@ e7d315dbf4f3928779e0dc624311196d44491d32 niklas_1280_720_30.yuv
c77e4a26616add298a05dd5d12397be22c0e40c5 vp90-2-18-resize.ivf
c12918cf0a716417fba2de35c3fc5ab90e52dfce vp90-2-18-resize.ivf.md5
717da707afcaa1f692ff1946f291054eb75a4f06 screendata.y4m
+b7c1296630cdf1a7ef493d15ff4f9eb2999202f6 invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8 invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res
+fac89b5735be8a86b0dc05159f996a5c3208ae32 invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf
+22e0ee8babe574722baf4ef6d7ff5d7cf80d386c invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf.res
+4506dfdcdf8ee4250924b075a0dcf1f070f72e5a invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf
+d3ea592c8d7b05d14c7ed48befc0a3aaf7709b7a invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf.res
+65e93f9653bcf65b022f7d225268d1a90a76e7bb vp90-2-19-skip.webm
+368dccdde5288c13c25695d2eacdc7402cadf613 vp90-2-19-skip.webm.md5
+ffe460282df2b0e7d4603c2158653ad96f574b02 vp90-2-19-skip-01.webm
+bd21bc9eda4a4a36b221d71ede3a139fc3c7bd85 vp90-2-19-skip-01.webm.md5
+b03c408cf23158638da18dbc3323b99a1635c68a invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8 invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
+5e67e24e7f53fd189e565513cef8519b1bd6c712 invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf
+741158f67c0d9d23726624d06bdc482ad368afc9 invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res
+8b1f7bf7e86c0976d277f60e8fcd9539e75a079a invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf
+fb79dcbbbb8c82d5a750e339acce66e39a32f15f invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf.res
diff --git a/source/libvpx/test/test.mk b/source/libvpx/test/test.mk
index 0814c2b..c839c92 100644
--- a/source/libvpx/test/test.mk
+++ b/source/libvpx/test/test.mk
@@ -785,6 +785,10 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5
@@ -793,16 +797,28 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v2.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
# BBB VP9 streams
diff --git a/source/libvpx/test/test_vectors.cc b/source/libvpx/test/test_vectors.cc
index dbdbdd6..cccebf8 100644
--- a/source/libvpx/test/test_vectors.cc
+++ b/source/libvpx/test/test_vectors.cc
@@ -181,7 +181,8 @@ const char *const kVP9TestVectors[] = {
"vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
"vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
"vp90-2-16-intra-only.webm", "vp90-2-17-show-existing-frame.webm",
- "vp90-2-18-resize.ivf", "vp91-2-04-yuv444.webm",
+ "vp90-2-18-resize.ivf", "vp90-2-19-skip.webm",
+ "vp90-2-19-skip-01.webm", "vp91-2-04-yuv444.webm",
};
const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
#endif // CONFIG_VP9_DECODER
diff --git a/source/libvpx/test/tile_independence_test.cc b/source/libvpx/test/tile_independence_test.cc
index d714452..b9f879d 100644
--- a/source/libvpx/test/tile_independence_test.cc
+++ b/source/libvpx/test/tile_independence_test.cc
@@ -29,7 +29,7 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,
md5_inv_order_(),
n_tiles_(GET_PARAM(1)) {
init_flags_ = VPX_CODEC_USE_PSNR;
- vpx_codec_dec_cfg_t cfg;
+ vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
cfg.w = 704;
cfg.h = 144;
cfg.threads = 1;
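
The switch to `vpx_codec_dec_cfg_t()` value-initializes the whole struct, whereas the bare declaration it replaces left the members indeterminate (and the `= {0}` form used elsewhere in these tests can draw missing-field-initializer warnings). A small sketch of the effect:

```cpp
#include <cassert>
#include "vpx/vpx_decoder.h"

void InitDecCfg() {
  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();  // every member zeroed
  assert(cfg.threads == 0 && cfg.w == 0 && cfg.h == 0);
  cfg.threads = 1;  // then set only the fields the test cares about
}
```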
diff --git a/source/libvpx/test/user_priv_test.cc b/source/libvpx/test/user_priv_test.cc
index 22fce85..8512d88 100644
--- a/source/libvpx/test/user_priv_test.cc
+++ b/source/libvpx/test/user_priv_test.cc
@@ -47,7 +47,7 @@ string DecodeFile(const string &filename) {
libvpx_test::WebMVideoSource video(filename);
video.Init();
- vpx_codec_dec_cfg_t cfg = {0};
+ vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
libvpx_test::VP9Decoder decoder(cfg, 0);
libvpx_test::MD5 md5;
diff --git a/source/libvpx/test/variance_test.cc b/source/libvpx/test/variance_test.cc
index 7d81182..f76402e 100644
--- a/source/libvpx/test/variance_test.cc
+++ b/source/libvpx/test/variance_test.cc
@@ -35,6 +35,14 @@ using ::std::tr1::make_tuple;
using ::std::tr1::tuple;
using libvpx_test::ACMRandom;
+static unsigned int mb_ss_ref(const int16_t *src) {
+ unsigned int res = 0;
+ for (int i = 0; i < 256; ++i) {
+ res += src[i] * src[i];
+ }
+ return res;
+}
+
static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
int l2w, int l2h, unsigned int *sse_ptr) {
int se = 0;
@@ -76,6 +84,50 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
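
Both reference helpers compute the textbook identity variance = SSE − SE²/N, where SE is the summed signed error and N = 2^(l2w + l2h) is the pixel count; because N is a power of two, the division is an exact shift. Restated as a standalone function (a paraphrase, not library code):

```cpp
#include <cstdint>

// Paraphrase of the return statements above: SSE minus the squared-mean term.
uint32_t VarianceFromSums(int64_t sse, int64_t se, int l2w, int l2h) {
  return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
}
```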
+typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
+
+class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
+ public:
+ SumOfSquaresTest() : func_(GetParam()) {}
+
+ virtual ~SumOfSquaresTest() {
+ libvpx_test::ClearSystemState();
+ }
+
+ protected:
+ void ConstTest();
+ void RefTest();
+
+ SumOfSquaresFunction func_;
+ ACMRandom rnd_;
+};
+
+void SumOfSquaresTest::ConstTest() {
+ int16_t mem[256];
+ unsigned int res;
+ for (int v = 0; v < 256; ++v) {
+ for (int i = 0; i < 256; ++i) {
+ mem[i] = v;
+ }
+ ASM_REGISTER_STATE_CHECK(res = func_(mem));
+ EXPECT_EQ(256u * (v * v), res);
+ }
+}
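
ConstTest relies on a closed form: 256 identical samples of value v have sum of squares 256·v². Even the worst case stays in range, which is why `unsigned int` suffices:

```cpp
// Worst case for the constant-input check: v = 255.
static_assert(256u * 255u * 255u == 16646400u,
              "max sum of squares fits comfortably in 32 bits");
```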
+
+void SumOfSquaresTest::RefTest() {
+ int16_t mem[256];
+ for (int i = 0; i < 100; ++i) {
+ for (int j = 0; j < 256; ++j) {
+ mem[j] = rnd_.Rand8() - rnd_.Rand8();
+ }
+
+ const unsigned int expected = mb_ss_ref(mem);
+ unsigned int res;
+ ASM_REGISTER_STATE_CHECK(res = func_(mem));
+ EXPECT_EQ(expected, res);
+ }
+}
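
RefTest draws each sample as the difference of two random bytes, so values fall in [−255, 255], roughly the shape of the prediction residuals vp9_get_mb_ss sees in practice; mb_ss_ref above supplies the expected result. The sample construction, isolated:

```cpp
#include <cstdint>

// Each RefTest sample: the difference of two bytes, in [-255, 255].
int16_t ResidualSample(uint8_t a, uint8_t b) {
  return static_cast<int16_t>(a) - static_cast<int16_t>(b);
}
```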
+
template<typename VarianceFunctionType>
class VarianceTest
: public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
@@ -88,7 +140,7 @@ class VarianceTest
height_ = 1 << log2height_;
variance_ = get<2>(params);
- rnd(ACMRandom::DeterministicSeed());
+ rnd_.Reset(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
ref_ = new uint8_t[block_size_];
@@ -107,7 +159,7 @@ class VarianceTest
void RefTest();
void OneQuarterTest();
- ACMRandom rnd;
+ ACMRandom rnd_;
uint8_t* src_;
uint8_t* ref_;
int width_, log2width_;
@@ -135,8 +187,8 @@ template<typename VarianceFunctionType>
void VarianceTest<VarianceFunctionType>::RefTest() {
for (int i = 0; i < 10; ++i) {
for (int j = 0; j < block_size_; j++) {
- src_[j] = rnd.Rand8();
- ref_[j] = rnd.Rand8();
+ src_[j] = rnd_.Rand8();
+ ref_[j] = rnd_.Rand8();
}
unsigned int sse1, sse2;
unsigned int var1;
@@ -206,7 +258,7 @@ class SubpelVarianceTest
height_ = 1 << log2height_;
subpel_variance_ = get<2>(params);
- rnd(ACMRandom::DeterministicSeed());
+ rnd_.Reset(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
@@ -226,7 +278,7 @@ class SubpelVarianceTest
protected:
void RefTest();
- ACMRandom rnd;
+ ACMRandom rnd_;
uint8_t *src_;
uint8_t *ref_;
uint8_t *sec_;
@@ -241,10 +293,10 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
for (int x = 0; x < 16; ++x) {
for (int y = 0; y < 16; ++y) {
for (int j = 0; j < block_size_; j++) {
- src_[j] = rnd.Rand8();
+ src_[j] = rnd_.Rand8();
}
for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
- ref_[j] = rnd.Rand8();
+ ref_[j] = rnd_.Rand8();
}
unsigned int sse1, sse2;
unsigned int var1;
@@ -263,11 +315,11 @@ void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
for (int x = 0; x < 16; ++x) {
for (int y = 0; y < 16; ++y) {
for (int j = 0; j < block_size_; j++) {
- src_[j] = rnd.Rand8();
- sec_[j] = rnd.Rand8();
+ src_[j] = rnd_.Rand8();
+ sec_[j] = rnd_.Rand8();
}
for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
- ref_[j] = rnd.Rand8();
+ ref_[j] = rnd_.Rand8();
}
unsigned int sse1, sse2;
unsigned int var1;
@@ -362,6 +414,13 @@ INSTANTIATE_TEST_CASE_P(
namespace vp9 {
#if CONFIG_VP9_ENCODER
+
+TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
+TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
+
+INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
+ ::testing::Values(vp9_get_mb_ss_c));
+
typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
@@ -485,23 +544,12 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(6, 5, subpel_avg_variance64x32_c),
make_tuple(6, 6, subpel_avg_variance64x64_c)));
-#if HAVE_MMX
-const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx;
-const vp9_variance_fn_t variance8x8_mmx = vp9_variance8x8_mmx;
-const vp9_variance_fn_t variance8x16_mmx = vp9_variance8x16_mmx;
-const vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx;
-const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx;
-INSTANTIATE_TEST_CASE_P(
- MMX, VP9VarianceTest,
- ::testing::Values(make_tuple(2, 2, variance4x4_mmx),
- make_tuple(3, 3, variance8x8_mmx),
- make_tuple(3, 4, variance8x16_mmx),
- make_tuple(4, 3, variance16x8_mmx),
- make_tuple(4, 4, variance16x16_mmx)));
-#endif
-
#if HAVE_SSE2
#if CONFIG_USE_X86INC
+
+INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
+ ::testing::Values(vp9_get_mb_ss_sse2));
+
const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
diff --git a/source/libvpx/test/vp8_decrypt_test.cc b/source/libvpx/test/vp8_decrypt_test.cc
index 470fdf1..972a1d9 100644
--- a/source/libvpx/test/vp8_decrypt_test.cc
+++ b/source/libvpx/test/vp8_decrypt_test.cc
@@ -47,7 +47,7 @@ TEST(TestDecrypt, DecryptWorksVp8) {
libvpx_test::IVFVideoSource video("vp80-00-comprehensive-001.ivf");
video.Init();
- vpx_codec_dec_cfg_t dec_cfg = {0};
+ vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
VP8Decoder decoder(dec_cfg, 0);
video.Begin();
diff --git a/source/libvpx/test/vp8_multi_resolution_encoder.sh b/source/libvpx/test/vp8_multi_resolution_encoder.sh
new file mode 100755
index 0000000..a8b7fe7
--- /dev/null
+++ b/source/libvpx/test/vp8_multi_resolution_encoder.sh
@@ -0,0 +1,75 @@
+#!/bin/sh
+##
+## Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+## This file tests the libvpx vp8_multi_resolution_encoder example. To add new
+## tests to this file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to vp8_mre_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+vp8_multi_resolution_encoder_verify_environment() {
+ if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then
+ if [ ! -e "${YUV_RAW_INPUT}" ]; then
+ elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+ return 1
+ fi
+ local readonly app="vp8_multi_resolution_encoder"
+ if [ -z "$(vpx_tool_path "${app}")" ]; then
+ elog "${app} not found. It must exist in LIBVPX_BIN_PATH or its parent."
+ return 1
+ fi
+ fi
+}
+
+# Runs vp8_multi_resolution_encoder. Simply forwards all arguments to
+# vp8_multi_resolution_encoder after building the path to the executable.
+vp8_mre() {
+ local readonly encoder="$(vpx_tool_path vp8_multi_resolution_encoder)"
+ if [ ! -x "${encoder}" ]; then
+ elog "${encoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${VPX_TEST_PREFIX}" "${encoder}" "$@" ${devnull}
+}
+
+vp8_multi_resolution_encoder_three_formats() {
+ local readonly output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf
+ ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf
+ ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf"
+
+ if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then
+ if [ "$(vp8_encode_available)" = "yes" ]; then
+ # Param order:
+ # Input width
+ # Input height
+ # Input file path
+ # Output file names
+ # Output PSNR
+ vp8_mre "${YUV_RAW_INPUT_WIDTH}" \
+ "${YUV_RAW_INPUT_HEIGHT}" \
+ "${YUV_RAW_INPUT}" \
+ ${output_files} \
+ 0
+
+ for output_file in ${output_files}; do
+ if [ ! -e "${output_file}" ]; then
+ elog "Missing output file: ${output_file}"
+ return 1
+ fi
+ done
+ fi
+ fi
+}
+
+vp8_mre_tests="vp8_multi_resolution_encoder_three_formats"
+run_tests vp8_multi_resolution_encoder_verify_environment "${vp8_mre_tests}"
diff --git a/source/libvpx/test/vp9_decrypt_test.cc b/source/libvpx/test/vp9_decrypt_test.cc
index 88a3c14..d988612 100644
--- a/source/libvpx/test/vp9_decrypt_test.cc
+++ b/source/libvpx/test/vp9_decrypt_test.cc
@@ -47,7 +47,7 @@ TEST(TestDecrypt, DecryptWorksVp9) {
libvpx_test::IVFVideoSource video("vp90-2-05-resize.ivf");
video.Init();
- vpx_codec_dec_cfg_t dec_cfg = {0};
+ vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
VP9Decoder decoder(dec_cfg, 0);
video.Begin();
diff --git a/source/libvpx/test/vp9_thread_test.cc b/source/libvpx/test/vp9_thread_test.cc
index d7fc4ee..cc35476 100644
--- a/source/libvpx/test/vp9_thread_test.cc
+++ b/source/libvpx/test/vp9_thread_test.cc
@@ -163,7 +163,7 @@ string DecodeFile(const string& filename, int num_threads) {
libvpx_test::WebMVideoSource video(filename);
video.Init();
- vpx_codec_dec_cfg_t cfg = {0};
+ vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
cfg.threads = num_threads;
libvpx_test::VP9Decoder decoder(cfg, 0);
diff --git a/source/libvpx/test/vpxenc.sh b/source/libvpx/test/vpxenc.sh
index b6482c6..9674bdc 100755
--- a/source/libvpx/test/vpxenc.sh
+++ b/source/libvpx/test/vpxenc.sh
@@ -41,6 +41,40 @@ vpxenc_can_encode_vp9() {
fi
}
+# Echo vpxenc command line parameters allowing use of
+# hantro_collage_w352h288.yuv as input.
+yuv_input_hantro_collage() {
+ echo ""${YUV_RAW_INPUT}"
+ --width="${YUV_RAW_INPUT_WIDTH}"
+ --height="${YUV_RAW_INPUT_HEIGHT}""
+}
+
+# Echo default vpxenc real-time encoding params. $1 is the codec, which defaults
+# to vp8 if unspecified.
+vpxenc_rt_params() {
+ local readonly codec="${1:-vp8}"
+ echo "--codec=${codec}
+ --buf-initial-sz=500
+ --buf-optimal-sz=600
+ --buf-sz=1000
+ --cpu-used=-5
+ --end-usage=cbr
+ --error-resilient=1
+ --kf-max-dist=90000
+ --lag-in-frames=0
+ --max-intra-rate=300
+ --max-q=56
+ --min-q=2
+ --noise-sensitivity=0
+ --overshoot-pct=50
+ --passes=1
+ --profile=0
+ --resize-allowed=0
+ --rt
+ --static-thresh=0
+ --undershoot-pct=50"
+}
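
For readers mapping these flags onto the C API: most correspond to `vpx_codec_enc_cfg_t` fields (field names below are from vpx/vpx_encoder.h; the flag-to-field pairing is this editor's reading of vpxenc, so treat it as a sketch, not as authoritative):

```cpp
#include "vpx/vpx_encoder.h"

// Rough CLI-to-API mapping for the flags above (sketch).
void ApplyRtParams(vpx_codec_enc_cfg_t *cfg) {
  cfg->rc_buf_initial_sz = 500;   // --buf-initial-sz
  cfg->rc_buf_optimal_sz = 600;   // --buf-optimal-sz
  cfg->rc_buf_sz = 1000;          // --buf-sz
  cfg->rc_end_usage = VPX_CBR;    // --end-usage=cbr
  cfg->g_error_resilient = 1;     // --error-resilient=1
  cfg->kf_max_dist = 90000;       // --kf-max-dist
  cfg->g_lag_in_frames = 0;       // --lag-in-frames=0
  cfg->rc_max_quantizer = 56;     // --max-q
  cfg->rc_min_quantizer = 2;      // --min-q
  cfg->rc_overshoot_pct = 50;     // --overshoot-pct
  cfg->rc_undershoot_pct = 50;    // --undershoot-pct
  cfg->g_pass = VPX_RC_ONE_PASS;  // --passes=1
  cfg->g_profile = 0;             // --profile=0
  cfg->rc_resize_allowed = 0;     // --resize-allowed=0
  // --cpu-used, --static-thresh, --noise-sensitivity and --max-intra-rate
  // map to vpx_codec_control() calls rather than to cfg fields.
}
```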
+
# Wrapper function for running vpxenc with pipe input. Requires that
# LIBVPX_BIN_PATH points to the directory containing vpxenc. $1 is used as the
# input file path and shifted away. All remaining parameters are passed through
@@ -59,9 +93,9 @@ vpxenc_pipe() {
# shifted away. All remaining parameters are passed through to vpxenc.
vpxenc() {
local readonly encoder="$(vpx_tool_path vpxenc)"
- local readonly input="${1}"
+ local readonly input="$1"
shift
- eval "${VPX_TEST_PREFIX}" "${encoder}" "$input" \
+ eval "${VPX_TEST_PREFIX}" "${encoder}" "${input}" \
--test-decode=fatal \
"$@" ${devnull}
}
@@ -69,13 +103,11 @@ vpxenc() {
vpxenc_vp8_ivf() {
if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf"
- vpxenc --codec=vp8 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp8 \
--limit="${TEST_FRAMES}" \
--ivf \
- --output="${output}" \
- "${YUV_RAW_INPUT}"
+ --output="${output}"
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -88,12 +120,10 @@ vpxenc_vp8_webm() {
if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
[ "$(webm_io_available)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
- vpxenc --codec=vp8 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp8 \
--limit="${TEST_FRAMES}" \
- --output="${output}" \
- "${YUV_RAW_INPUT}"
+ --output="${output}"
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -102,17 +132,29 @@ vpxenc_vp8_webm() {
fi
}
+vpxenc_vp8_webm_rt() {
+ if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm"
+ vpxenc $(yuv_input_hantro_collage) \
+ $(vpxenc_rt_params vp8) \
+ --output="${output}"
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
vpxenc_vp8_webm_2pass() {
if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
[ "$(webm_io_available)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
- vpxenc --codec=vp8 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp8 \
--limit="${TEST_FRAMES}" \
--output="${output}" \
- --passes=2 \
- "${YUV_RAW_INPUT}"
+ --passes=2
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -127,15 +169,13 @@ vpxenc_vp8_webm_lag10_frames20() {
local readonly lag_total_frames=20
local readonly lag_frames=10
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm"
- vpxenc --codec=vp8 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp8 \
--limit="${lag_total_frames}" \
--lag-in-frames="${lag_frames}" \
--output="${output}" \
--auto-alt-ref=1 \
- --passes=2 \
- "${YUV_RAW_INPUT}"
+ --passes=2
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -147,14 +187,11 @@ vpxenc_vp8_webm_lag10_frames20() {
vpxenc_vp8_ivf_piped_input() {
if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf"
- cat "${YUV_RAW_INPUT}" \
- | vpxenc --codec=vp8 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
- --limit="${TEST_FRAMES}" \
- --ivf \
- --output="${output}" \
- -
+ vpxenc_pipe $(yuv_input_hantro_collage) \
+ --codec=vp8 \
+ --limit="${TEST_FRAMES}" \
+ --ivf \
+ --output="${output}"
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -166,13 +203,11 @@ vpxenc_vp8_ivf_piped_input() {
vpxenc_vp9_ivf() {
if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf"
- vpxenc --codec=vp9 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp9 \
--limit="${TEST_FRAMES}" \
--ivf \
- --output="${output}" \
- "${YUV_RAW_INPUT}"
+ --output="${output}"
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -185,12 +220,25 @@ vpxenc_vp9_webm() {
if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
[ "$(webm_io_available)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
- vpxenc --codec=vp9 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp9 \
--limit="${TEST_FRAMES}" \
- --output="${output}" \
- "${YUV_RAW_INPUT}"
+ --output="${output}"
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+vpxenc_vp9_webm_rt() {
+ if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm"
+ vpxenc $(yuv_input_hantro_collage) \
+ $(vpxenc_rt_params vp9) \
+ --output="${output}"
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -203,14 +251,11 @@ vpxenc_vp9_webm_2pass() {
if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
[ "$(webm_io_available)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
- vpxenc --codec=vp9 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp9 \
--limit="${TEST_FRAMES}" \
- --test-decode=fatal \
--output="${output}" \
- --passes=2 \
- "${YUV_RAW_INPUT}"
+ --passes=2
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -222,14 +267,12 @@ vpxenc_vp9_webm_2pass() {
vpxenc_vp9_ivf_lossless() {
if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf"
- vpxenc --codec=vp9 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp9 \
--limit="${TEST_FRAMES}" \
--ivf \
--output="${output}" \
- --lossless=1 \
- "${YUV_RAW_INPUT}"
+ --lossless=1
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -241,15 +284,13 @@ vpxenc_vp9_ivf_lossless() {
vpxenc_vp9_ivf_minq0_maxq0() {
if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf"
- vpxenc --codec=vp9 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp9 \
--limit="${TEST_FRAMES}" \
--ivf \
--output="${output}" \
--min-q=0 \
- --max-q=0 \
- "${YUV_RAW_INPUT}"
+ --max-q=0
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -264,16 +305,13 @@ vpxenc_vp9_webm_lag10_frames20() {
local readonly lag_total_frames=20
local readonly lag_frames=10
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm"
- vpxenc --codec=vp9 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp9 \
--limit="${lag_total_frames}" \
--lag-in-frames="${lag_frames}" \
--output="${output}" \
- --test-decode=fatal \
--passes=2 \
- --auto-alt-ref=1 \
- "${YUV_RAW_INPUT}"
+ --auto-alt-ref=1
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -284,11 +322,13 @@ vpxenc_vp9_webm_lag10_frames20() {
vpxenc_tests="vpxenc_vp8_ivf
vpxenc_vp8_webm
+ vpxenc_vp8_webm_rt
vpxenc_vp8_webm_2pass
vpxenc_vp8_webm_lag10_frames20
vpxenc_vp8_ivf_piped_input
vpxenc_vp9_ivf
vpxenc_vp9_webm
+ vpxenc_vp9_webm_rt
vpxenc_vp9_webm_2pass
vpxenc_vp9_ivf_lossless
vpxenc_vp9_ivf_minq0_maxq0
diff --git a/source/libvpx/test/y4m_test.cc b/source/libvpx/test/y4m_test.cc
index 17cd782..58a6fe3 100644
--- a/source/libvpx/test/y4m_test.cc
+++ b/source/libvpx/test/y4m_test.cc
@@ -57,7 +57,7 @@ static void write_image_file(const vpx_image_t *img, FILE *file) {
for (plane = 0; plane < 3; ++plane) {
const unsigned char *buf = img->planes[plane];
const int stride = img->stride[plane];
- const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
+ const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
const int h = (plane ? (img->d_h + img->y_chroma_shift) >>
img->y_chroma_shift : img->d_h);
const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
diff --git a/source/libvpx/third_party/libyuv/README.libvpx b/source/libvpx/third_party/libyuv/README.libvpx
index fa5b498..3869d25 100644
--- a/source/libvpx/third_party/libyuv/README.libvpx
+++ b/source/libvpx/third_party/libyuv/README.libvpx
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1041
+Version: 1060
License: BSD
License File: LICENSE
@@ -13,4 +13,4 @@ which down-samples the original input video (e.g. 1280x720) a number of times
in order to encode multiple resolution bit streams.
Local Modifications:
-None.
+cherry-pick 'Issue 24479004: Fix building with MSVC for arm'
diff --git a/source/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/source/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h
index 82fd95d..8423121 100644
--- a/source/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h
+++ b/source/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h
@@ -153,7 +153,6 @@ class LIBYUV_API MJpegDecoder {
int* subsample_x, int* subsample_y, int number_of_components);
private:
-
void AllocOutputBuffers(int num_outbufs);
void DestroyOutputBuffers();
diff --git a/source/libvpx/third_party/libyuv/include/libyuv/row.h b/source/libvpx/third_party/libyuv/include/libyuv/row.h
index fdfe1ae..4b3c870 100644
--- a/source/libvpx/third_party/libyuv/include/libyuv/row.h
+++ b/source/libvpx/third_party/libyuv/include/libyuv/row.h
@@ -252,6 +252,94 @@ extern "C" {
// The following are available on arm64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+// #define HAS_I444TOARGBROW_NEON
+// #define HAS_I422TOARGBROW_NEON
+// #define HAS_I411TOARGBROW_NEON
+// #define HAS_I422TOBGRAROW_NEON
+// #define HAS_I422TOABGRROW_NEON
+// #define HAS_I422TORGBAROW_NEON
+// #define HAS_I422TORGB24ROW_NEON
+// #define HAS_I422TORAWROW_NEON
+// #define HAS_I422TORGB565ROW_NEON
+// #define HAS_I422TOARGB1555ROW_NEON
+// #define HAS_I422TOARGB4444ROW_NEON
+// #define HAS_YTOARGBROW_NEON
+// #define HAS_I400TOARGBROW_NEON
+// #define HAS_NV12TOARGBROW_NEON
+// #define HAS_NV21TOARGBROW_NEON
+// #define HAS_NV12TORGB565ROW_NEON
+// #define HAS_NV21TORGB565ROW_NEON
+// #define HAS_YUY2TOARGBROW_NEON
+// #define HAS_UYVYTOARGBROW_NEON
+#define HAS_SPLITUVROW_NEON
+#define HAS_MERGEUVROW_NEON
+#define HAS_COPYROW_NEON
+#define HAS_SETROW_NEON
+#define HAS_ARGBSETROWS_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
+#define HAS_ARGBMIRRORROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+// #define HAS_RGB565TOARGBROW_NEON
+// #define HAS_ARGB1555TOARGBROW_NEON
+// #define HAS_ARGB4444TOARGBROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_YUY2TOYROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_HALFROW_NEON
+#define HAS_ARGBTOBAYERROW_NEON
+#define HAS_ARGBTOBAYERGGROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
+#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I422TOUYVYROW_NEON
+// #define HAS_ARGBTORGB565ROW_NEON
+// #define HAS_ARGBTOARGB1555ROW_NEON
+// #define HAS_ARGBTOARGB4444ROW_NEON
+#define HAS_ARGBTOYROW_NEON
+#define HAS_ARGBTOYJROW_NEON
+// #define HAS_ARGBTOUV444ROW_NEON
+// #define HAS_ARGBTOUV422ROW_NEON
+// #define HAS_ARGBTOUV411ROW_NEON
+// #define HAS_ARGBTOUVROW_NEON
+// #define HAS_ARGBTOUVJROW_NEON
+// #define HAS_BGRATOUVROW_NEON
+// #define HAS_ABGRTOUVROW_NEON
+// #define HAS_RGBATOUVROW_NEON
+// #define HAS_RGB24TOUVROW_NEON
+// #define HAS_RAWTOUVROW_NEON
+// #define HAS_RGB565TOUVROW_NEON
+// #define HAS_ARGB1555TOUVROW_NEON
+// #define HAS_ARGB4444TOUVROW_NEON
+// #define HAS_RGB565TOYROW_NEON
+// #define HAS_ARGB1555TOYROW_NEON
+// #define HAS_ARGB4444TOYROW_NEON
+// #define HAS_BGRATOYROW_NEON
+// #define HAS_ABGRTOYROW_NEON
+// #define HAS_RGBATOYROW_NEON
+// #define HAS_RGB24TOYROW_NEON
+// #define HAS_RAWTOYROW_NEON
+// #define HAS_INTERPOLATEROW_NEON
+// #define HAS_ARGBBLENDROW_NEON
+// #define HAS_ARGBATTENUATEROW_NEON
+// #define HAS_ARGBQUANTIZEROW_NEON
+// #define HAS_ARGBSHADEROW_NEON
+// #define HAS_ARGBGRAYROW_NEON
+// #define HAS_ARGBSEPIAROW_NEON
+// #define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBMULTIPLYROW_NEON
+#define HAS_ARGBADDROW_NEON
+#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_SOBELROW_NEON
+#define HAS_SOBELTOPLANEROW_NEON
+#define HAS_SOBELXYROW_NEON
+#define HAS_SOBELXROW_NEON
+#define HAS_SOBELYROW_NEON
#endif
// The following are available on Neon platforms:
@@ -465,7 +553,7 @@ typedef uint8 uvec8[16];
#opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
#endif // defined(__native_client__) && defined(__x86_64__)
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
#undef MEMACCESS
#if defined(__native_client__)
#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
diff --git a/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h b/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h
index 8dc0762..3c49542 100644
--- a/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h
+++ b/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h
@@ -51,6 +51,14 @@ extern "C" {
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
+#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+ (defined(__aarch64__) || defined(LIBYUV_NEON))
+/* #define HAS_SCALEROWDOWN2_NEON */
+/* #define HAS_SCALEROWDOWN4_NEON */
+/* #define HAS_SCALEROWDOWN34_NEON */
+/* #define HAS_SCALEROWDOWN38_NEON */
+/* #define HAS_SCALEARGBROWDOWNEVEN_NEON */
+/* #define HAS_SCALEARGBROWDOWN2_NEON */
#endif
// The following are available on Mips platforms:
diff --git a/source/libvpx/third_party/libyuv/include/libyuv/version.h b/source/libvpx/third_party/libyuv/include/libyuv/version.h
index 912c4c9..73a7f1b 100644
--- a/source/libvpx/third_party/libyuv/include/libyuv/version.h
+++ b/source/libvpx/third_party/libyuv/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1041
+#define LIBYUV_VERSION 1059
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/source/libvpx/third_party/libyuv/source/compare.cc b/source/libvpx/third_party/libyuv/source/compare.cc
index 9ea81b4..dc715e0 100644
--- a/source/libvpx/third_party/libyuv/source/compare.cc
+++ b/source/libvpx/third_party/libyuv/source/compare.cc
@@ -80,7 +80,7 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SUMSQUAREERROR_NEON
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#endif
diff --git a/source/libvpx/third_party/libyuv/source/compare_neon.cc b/source/libvpx/third_party/libyuv/source/compare_neon.cc
index 5e7b8e4..55052c0 100644
--- a/source/libvpx/third_party/libyuv/source/compare_neon.cc
+++ b/source/libvpx/third_party/libyuv/source/compare_neon.cc
@@ -56,6 +56,45 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
return sse;
}
+#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ volatile uint32 sse;
+ asm volatile (
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n"
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %2, %2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "bgt 1b \n"
+
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(sse)
+ :
+ : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+ return sse;
+}
+
#endif // __ARM_NEON__
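
In scalar terms, the aarch64 routine above computes the following; the assembly processes 16 bytes per iteration with widening subtracts (`usubl`/`usubl2`) and widening multiply-accumulates (`smlal`/`smlal2`) into four accumulators, then folds them with `addv`. A plain-C restatement (assumes count is a multiple of 16, as the NEON loop consumes):

```cpp
#include <stdint.h>

// Scalar restatement of SumSquareError_NEON (sketch, not the shipped code).
uint32_t SumSquareErrorScalar(const uint8_t *src_a, const uint8_t *src_b,
                              int count) {
  uint32_t sse = 0;
  for (int i = 0; i < count; ++i) {
    const int diff = src_a[i] - src_b[i];
    sse += (uint32_t)(diff * diff);
  }
  return sse;
}
```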
#ifdef __cplusplus
diff --git a/source/libvpx/third_party/libyuv/source/convert.cc b/source/libvpx/third_party/libyuv/source/convert.cc
index 874a6cb..a8e294f 100644
--- a/source/libvpx/third_party/libyuv/source/convert.cc
+++ b/source/libvpx/third_party/libyuv/source/convert.cc
@@ -401,7 +401,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height) {
int y;
- int halfheight = (height + 1) >> 1;
+ int halfheight;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
int pix) = YUY2ToUV422Row_C;
@@ -711,11 +711,13 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
@@ -963,9 +965,6 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
#endif
if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -1022,36 +1021,44 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RGB24TOYROW_NEON
- for (y = 0; y < height - 1; y += 2) {
+ {
+#if !defined(HAS_RGB24TOYROW_NEON)
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_RGB24TOYROW_NEON)
- RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
- RGB24ToYRow(src_rgb24, dst_y, width);
- RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+ RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+ RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
#else
- RGB24ToARGBRow(src_rgb24, row, width);
- RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
#endif
- src_rgb24 += src_stride_rgb24 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
#if defined(HAS_RGB24TOYROW_NEON)
- RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
- RGB24ToYRow(src_rgb24, dst_y, width);
+ RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
#else
- RGB24ToARGBRow(src_rgb24, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
#endif
- }
+ }
#if !defined(HAS_RGB24TOYROW_NEON)
- free_aligned_buffer_64(row);
+ free_aligned_buffer_64(row);
#endif
+ }
return 0;
}
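
The reshuffle in RGB24ToI420 (and in the conversions that follow) applies one pattern repeatedly: the row-buffer allocation moves from the declaration section into a block around the loops. Because `align_buffer_64` declares a variable and C89 requires declarations to precede statements, the allocation needs a fresh scope once the early parameter checks have run; the block also confines the allocation to builds that lack the direct NEON row functions. A schematic of the shape (placeholder logic; `malloc` stands in for the libyuv macro):

```cpp
#include <stdlib.h>

/* Schematic of the C89-friendly structure used above; placeholder logic. */
int ConvertSketch(int width, int height) {
  int y; /* C89: declarations come before any statement */
  if (width <= 0 || height == 0) return -1; /* early checks are statements */
  { /* a new block, so declarations become legal again */
    unsigned char *row = (unsigned char *)malloc((size_t)width * 4 * 2);
    if (row == NULL) return -1;
    for (y = 0; y < height - 1; y += 2) {
      /* ... convert two rows via the intermediate buffer ... */
    }
    free(row);
  }
  return 0;
}
```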
@@ -1075,9 +1082,6 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
#endif
if (!src_raw || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -1134,36 +1138,42 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RAWTOYROW_NEON
- for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RAWTOYROW_NEON)
- RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
- RAWToYRow(src_raw, dst_y, width);
- RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
-#else
- RAWToARGBRow(src_raw, row, width);
- RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_raw += src_stride_raw * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if defined(HAS_RAWTOYROW_NEON)
- RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
- RAWToYRow(src_raw, dst_y, width);
-#else
- RAWToARGBRow(src_raw, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+
+ for (y = 0; y < height - 1; y += 2) {
+ #if defined(HAS_RAWTOYROW_NEON)
+ RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+ RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+ #else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+ #endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ #if defined(HAS_RAWTOYROW_NEON)
+ RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+ #else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ #endif
+ }
+ #if !defined(HAS_RAWTOYROW_NEON)
+ free_aligned_buffer_64(row);
+ #endif
}
-#if !defined(HAS_RAWTOYROW_NEON)
- free_aligned_buffer_64(row);
-#endif
return 0;
}
@@ -1187,9 +1197,6 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
#endif
if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -1246,36 +1253,44 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RGB565TOYROW_NEON
- for (y = 0; y < height - 1; y += 2) {
+ {
+#if !defined(HAS_RGB565TOYROW_NEON)
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_RGB565TOYROW_NEON)
- RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
- RGB565ToYRow(src_rgb565, dst_y, width);
- RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+ RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+ RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
#else
- RGB565ToARGBRow(src_rgb565, row, width);
- RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+ RGB565ToARGBRow(src_rgb565, row, width);
+ RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
#endif
- src_rgb565 += src_stride_rgb565 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
+ src_rgb565 += src_stride_rgb565 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
#if defined(HAS_RGB565TOYROW_NEON)
- RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
- RGB565ToYRow(src_rgb565, dst_y, width);
+ RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
#else
- RGB565ToARGBRow(src_rgb565, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
+ RGB565ToARGBRow(src_rgb565, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
#endif
- }
+ }
#if !defined(HAS_RGB565TOYROW_NEON)
- free_aligned_buffer_64(row);
+ free_aligned_buffer_64(row);
#endif
+ }
return 0;
}
@@ -1299,9 +1314,6 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
#endif
if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -1358,38 +1370,45 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_ARGB1555TOYROW_NEON
- for (y = 0; y < height - 1; y += 2) {
+ {
+#if !defined(HAS_ARGB1555TOYROW_NEON)
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+ for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_ARGB1555TOYROW_NEON)
- ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
- ARGB1555ToYRow(src_argb1555, dst_y, width);
- ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
- width);
+ ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+ ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+ width);
#else
- ARGB1555ToARGBRow(src_argb1555, row, width);
- ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
- width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+ width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
#endif
- src_argb1555 += src_stride_argb1555 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
+ src_argb1555 += src_stride_argb1555 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
#if defined(HAS_ARGB1555TOYROW_NEON)
- ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
- ARGB1555ToYRow(src_argb1555, dst_y, width);
+ ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
#else
- ARGB1555ToARGBRow(src_argb1555, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
#endif
- }
+ }
#if !defined(HAS_ARGB1555TOYROW_NEON)
free_aligned_buffer_64(row);
#endif
+ }
return 0;
}
@@ -1413,9 +1432,6 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
#endif
if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -1472,38 +1488,46 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_ARGB4444TOYROW_NEON
- for (y = 0; y < height - 1; y += 2) {
+ {
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_ARGB4444TOYROW_NEON)
- ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
- ARGB4444ToYRow(src_argb4444, dst_y, width);
- ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
- width);
+ ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+ ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+ width);
#else
- ARGB4444ToARGBRow(src_argb4444, row, width);
- ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
- width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
+ width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
#endif
- src_argb4444 += src_stride_argb4444 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
+ src_argb4444 += src_stride_argb4444 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
#if defined(HAS_ARGB4444TOYROW_NEON)
- ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
- ARGB4444ToYRow(src_argb4444, dst_y, width);
+ ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
#else
- ARGB4444ToARGBRow(src_argb4444, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
#endif
- }
+ }
#if !defined(HAS_ARGB4444TOYROW_NEON)
- free_aligned_buffer_64(row);
+ free_aligned_buffer_64(row);
#endif
+ }
return 0;
}
diff --git a/source/libvpx/third_party/libyuv/source/convert_from_argb.cc b/source/libvpx/third_party/libyuv/source/convert_from_argb.cc
index 121a416..de461dd 100644
--- a/source/libvpx/third_party/libyuv/source/convert_from_argb.cc
+++ b/source/libvpx/third_party/libyuv/source/convert_from_argb.cc
@@ -60,6 +60,13 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
}
}
}
+#elif defined(HAS_ARGBTOUV444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToUV444Row = ARGBToUV444Row_NEON;
+ }
+ }
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@@ -76,10 +83,8 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
#elif defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
- ARGBToUV444Row = ARGBToUV444Row_NEON;
}
}
#endif
@@ -134,6 +139,13 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
}
}
}
+#elif defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
@@ -153,12 +165,6 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 16) {
- ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_NEON;
- }
- }
}
#endif
@@ -228,11 +234,13 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 32) {
- ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUV411Row = ARGBToUV411Row_NEON;
- }
+ }
+#endif
+#if defined(HAS_ARGBTOUV411ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 32) {
+ ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUV411Row = ARGBToUV411Row_NEON;
}
}
#endif
@@ -261,9 +269,6 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C;
- // Allocate a rows of uv.
- align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
- uint8* row_v = row_u + ((halfwidth + 15) & ~15);
if (!src_argb ||
!dst_y || !dst_uv ||
width <= 0 || height == 0) {
@@ -296,11 +301,13 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
@@ -331,22 +338,27 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+ {
+ // Allocate one row each of U and V.
+ align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+ uint8* row_v = row_u + ((halfwidth + 15) & ~15);
- for (y = 0; y < height - 1; y += 2) {
- ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
- MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
- ARGBToYRow(src_argb, dst_y, width);
- ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
- src_argb += src_stride_argb * 2;
- dst_y += dst_stride_y * 2;
- dst_uv += dst_stride_uv;
- }
- if (height & 1) {
- ARGBToUVRow(src_argb, 0, row_u, row_v, width);
- MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
- ARGBToYRow(src_argb, dst_y, width);
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
}
- free_aligned_buffer_64(row_u);
return 0;
}
@@ -364,9 +376,6 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C;
- // Allocate a rows of uv.
- align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
- uint8* row_v = row_u + ((halfwidth + 15) & ~15);
if (!src_argb ||
!dst_y || !dst_uv ||
width <= 0 || height == 0) {
@@ -399,11 +408,13 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
@@ -434,22 +445,27 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+ {
+ // Allocate one row each of U and V.
+ align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+ uint8* row_v = row_u + ((halfwidth + 15) & ~15);
- for (y = 0; y < height - 1; y += 2) {
- ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
- MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
- ARGBToYRow(src_argb, dst_y, width);
- ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
- src_argb += src_stride_argb * 2;
- dst_y += dst_stride_y * 2;
- dst_uv += dst_stride_uv;
- }
- if (height & 1) {
- ARGBToUVRow(src_argb, 0, row_u, row_v, width);
- MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
- ARGBToYRow(src_argb, dst_y, width);
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
}
- free_aligned_buffer_64(row_u);
return 0;
}
@@ -493,6 +509,13 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
}
}
}
+#elif defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@@ -510,12 +533,6 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 16) {
- ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_NEON;
- }
- }
}
#endif
@@ -594,6 +611,13 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
}
}
}
+#elif defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@@ -611,12 +635,6 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 16) {
- ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_NEON;
- }
- }
}
#endif
@@ -1022,11 +1040,13 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) {
ARGBToYJRow = ARGBToYJRow_NEON;
}
- if (width >= 16) {
- ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_NEON;
}
}
#endif
diff --git a/source/libvpx/third_party/libyuv/source/cpu_id.cc b/source/libvpx/third_party/libyuv/source/cpu_id.cc
index 2e0d61d..8f8a403 100644
--- a/source/libvpx/third_party/libyuv/source/cpu_id.cc
+++ b/source/libvpx/third_party/libyuv/source/cpu_id.cc
@@ -14,8 +14,9 @@
#include <intrin.h> // For __cpuidex()
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
- !defined(__native_client__) && defined(_M_X64) && \
- defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+ !defined(__native_client__) && \
+ defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \
+ (defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h> // For _xgetbv()
#endif
@@ -97,7 +98,7 @@ int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
-#elif defined(_M_IX86)
+#elif defined(_M_IX86) && defined(_MSC_VER)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
@@ -256,12 +257,17 @@ int InitCpuFlags(void) {
if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
cpu_info_ &= ~kCpuHasMIPS_DSPR2;
}
-#elif defined(__arm__)
+#elif defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
cpu_info_ = kCpuHasNEON;
+// For aarch64 (arm64), the /proc/cpuinfo feature list is incomplete; e.g.
+// it has no neon flag.
+// So for aarch64, NEON support is hard-coded here.
+#elif defined(__aarch64__)
+ cpu_info_ = kCpuHasNEON;
#else
// Linux arm parse text file for neon detect.
cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
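
The cpu_id.cc change extends the ARM branch to __aarch64__ and hard-codes kCpuHasNEON there: an arm64 kernel's /proc/cpuinfo advertises features such as "asimd" rather than "neon", so the existing text scan would wrongly report no NEON even though Advanced SIMD is effectively always present on AArch64 application cores. For 32-bit ARM Linux the text scan remains; a simplified sketch of what such a scan looks like (not libyuv's actual ArmCpuCaps):

#include <stdio.h>
#include <string.h>

/* Returns nonzero if the Features line of the given cpuinfo file mentions
 * "neon". Simplified sketch; a real parser should also handle long lines. */
static int ArmHasNeonSketch(const char* cpuinfo_name) {
  char line[512];
  int has_neon = 0;
  FILE* f = fopen(cpuinfo_name, "r");
  if (!f) {
    return 0;  /* no cpuinfo: conservatively assume no NEON */
  }
  while (fgets(line, sizeof(line), f)) {
    if (!memcmp(line, "Features", 8) && strstr(line, " neon")) {
      has_neon = 1;
      break;
    }
  }
  fclose(f);
  return has_neon;
}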
diff --git a/source/libvpx/third_party/libyuv/source/format_conversion.cc b/source/libvpx/third_party/libyuv/source/format_conversion.cc
index a3daf96..3c17371 100644
--- a/source/libvpx/third_party/libyuv/source/format_conversion.cc
+++ b/source/libvpx/third_party/libyuv/source/format_conversion.cc
@@ -332,11 +332,13 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
diff --git a/source/libvpx/third_party/libyuv/source/mjpeg_decoder.cc b/source/libvpx/third_party/libyuv/source/mjpeg_decoder.cc
index 15b0ed8..36028c3 100644
--- a/source/libvpx/third_party/libyuv/source/mjpeg_decoder.cc
+++ b/source/libvpx/third_party/libyuv/source/mjpeg_decoder.cc
@@ -13,8 +13,8 @@
#ifdef HAVE_JPEG
#include <assert.h>
-#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\
- !defined(TARGET_IPHONE_SIMULATOR)
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
// Must be included before jpeglib.
#include <setjmp.h>
#define HAVE_SETJMP
@@ -101,7 +101,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
}
buf_.data = src;
- buf_.len = (int)(src_len);
+ buf_.len = static_cast<int>(src_len);
buf_vec_.pos = 0;
decompress_struct_->client_data = &buf_vec_;
#ifdef HAVE_SETJMP
@@ -411,7 +411,7 @@ void init_source(j_decompress_ptr cinfo) {
}
boolean fill_input_buffer(j_decompress_ptr cinfo) {
- BufferVector* buf_vec = (BufferVector*)(cinfo->client_data);
+ BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
if (buf_vec->pos >= buf_vec->len) {
assert(0 && "No more data");
// ERROR: No more data
@@ -447,7 +447,7 @@ void ErrorHandler(j_common_ptr cinfo) {
// ERROR: Error in jpeglib: buf
#endif
- SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err);
+ SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
// This rewinds the call stack to the point of the corresponding setjmp()
// and causes it to return (for a second time) with value 1.
longjmp(mgr->setjmp_buffer, 1);
diff --git a/source/libvpx/third_party/libyuv/source/row_any.cc b/source/libvpx/third_party/libyuv/source/row_any.cc
index 97ef844..ce8b3da 100644
--- a/source/libvpx/third_party/libyuv/source/row_any.cc
+++ b/source/libvpx/third_party/libyuv/source/row_any.cc
@@ -79,9 +79,13 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
1, 2, 7)
YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
+#endif // HAS_I422TOARGBROW_NEON
+#ifdef HAS_I422TOYUY2ROW_NEON
YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
+#endif // HAS_I422TOYUY2ROW_NEON
+#ifdef HAS_I422TOUYVYROW_NEON
YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
-#endif // HAS_I422TOARGBROW_NEON
+#endif // HAS_I422TOUYVYROW_NEON
#undef YANY
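
The same per-kernel gating lands in the YANY table above: one umbrella #ifdef HAS_I422TOARGBROW_NEON becomes a guard per generated wrapper, so the expansions track exactly which NEON kernels a platform provides. Each YANY expansion is the usual "any width" shim: run the SIMD kernel over the bulk of the row, then let the C kernel finish the remainder. A hand-written sketch of roughly what YANY(I422ToYUY2Row_Any_NEON, ..., 1, 2, 15) expands to; the _NEON/_C kernels are libyuv's, the wrapper body here is illustrative:

typedef unsigned char uint8;

void I422ToYUY2Row_NEON(const uint8* y, const uint8* u, const uint8* v,
                        uint8* dst, int width);
void I422ToYUY2Row_C(const uint8* y, const uint8* u, const uint8* v,
                     uint8* dst, int width);

void I422ToYUY2Row_Any_NEON_sketch(const uint8* y, const uint8* u,
                                   const uint8* v, uint8* dst, int width) {
  int n = width & ~15;                    /* largest multiple of 16 */
  if (n > 0) {
    I422ToYUY2Row_NEON(y, u, v, dst, n);  /* fast bulk */
  }
  /* YUY2 packs 2 pixels into 4 bytes; U/V advance at half the Y rate. */
  I422ToYUY2Row_C(y + n, u + n / 2, v + n / 2, dst + n * 2, width & 15);
}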
// Wrappers to handle odd width
@@ -250,12 +254,26 @@ YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
+#endif
+#ifdef HAS_UYVYTOYROW_NEON
YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
+#endif
+#ifdef HAS_RGB24TOARGBROW_NEON
YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
+#endif
+#ifdef HAS_RAWTOARGBROW_NEON
YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
+#endif
+#ifdef HAS_RGB565TOARGBROW_NEON
YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_NEON
YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_NEON
YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
#endif
#undef YANY
@@ -333,7 +351,11 @@ UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_NEON
UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
#endif
#undef UVANY
diff --git a/source/libvpx/third_party/libyuv/source/row_neon64.cc b/source/libvpx/third_party/libyuv/source/row_neon64.cc
index 46e9ceb..21111cf 100644
--- a/source/libvpx/third_party/libyuv/source/row_neon64.cc
+++ b/source/libvpx/third_party/libyuv/source/row_neon64.cc
@@ -824,19 +824,19 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store U
+ "st1 {v0.16b}, [%1], #16 \n" // store U
MEMACCESS(2)
- "vst1.8 {q1}, [%2]! \n" // store V
+ "st1 {v1.16b}, [%2], #16 \n" // store V
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3 // Output registers
: // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
#endif // HAS_SPLITUVROW_NEON
@@ -849,12 +849,12 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load U
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load V
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(2)
- "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"bgt 1b \n"
:
"+r"(src_u), // %0
@@ -862,7 +862,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
"+r"(dst_uv), // %2
"+r"(width) // %3 // Output registers
: // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
#endif // HAS_MERGEUVROW_NEON
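
MergeUVRow_NEON is representative of this file's mechanical ARM32-to-AArch64 translation: vld1/vst2 on q registers become ld1/st2 on v registers with explicit .16b arrangements and explicit post-increments, and the clobber list switches from q names to v names. Functionally the kernel just interleaves a U row and a V row into a packed NV12/NV21-style UV row; a scalar model:

/* Scalar model of MergeUVRow: interleave U and V into a packed UV row.
 * width counts UV pairs, matching the halfwidth the callers pass. */
void MergeUVRow_scalar(const unsigned char* src_u, const unsigned char* src_v,
                       unsigned char* dst_uv, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_uv[2 * i + 0] = src_u[i];  /* st2 writes U to even bytes */
    dst_uv[2 * i + 1] = src_v[i];  /* and V to odd bytes */
  }
}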
@@ -874,16 +874,16 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
MEMACCESS(1)
- "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(count) // %2 // Output registers
: // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_COPYROW_NEON
@@ -892,16 +892,16 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
#ifdef HAS_SETROW_NEON
void SetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile (
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n" // store
+ "st1 {v0.16b}, [%0], #16 \n" // store
"bgt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
- : "cc", "memory", "q0"
+ : "cc", "memory", "v0"
);
}
#endif // HAS_SETROW_NEON
@@ -922,26 +922,25 @@ void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// Start at end of source row.
- "mov r3, #-16 \n"
"add %0, %0, %2 \n"
- "sub %0, #16 \n"
+ "sub %0, %0, #16 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #16 \n" // 16 pixels per loop.
- "vrev64.8 q0, q0 \n"
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %2, %2, #16 \n" // 16 pixels per loop.
+ "rev64 v0.16b, v0.16b \n"
MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // dst += 16
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n"
+ "st1 {v0.D}[0], [%1], #8 \n"
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0"
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0"
);
}
#endif // HAS_MIRRORROW_NEON
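
MirrorRow_NEON shows one more translation detail: the ARM32 version materialized the -16 post-index step in a scratch register ("mov r3, #-16") and listed r3 as a clobber, while the AArch64 rewrite passes the constant in as an input operand and lets the compiler pick the register. A minimal standalone example of that inline-asm shape (aarch64 only; the fallback branch is just so the sketch compiles elsewhere):

#include <stddef.h>

long add_step(long x) {
#if defined(__aarch64__)
  asm volatile(
      "add %0, %0, %1                 \n"  /* compiler picks a reg for %1 */
      : "+r"(x)                            /* %0: read/write operand */
      : "r"((ptrdiff_t)-16));              /* %1: constant input, no clobber */
#else
  x += -16;  /* portable fallback */
#endif
  return x;
}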
@@ -951,27 +950,27 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
// Start at end of source row.
- "mov r12, #-16 \n"
"add %0, %0, %3, lsl #1 \n"
- "sub %0, #16 \n"
+ "sub %0, %0, #16 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
- "subs %3, #8 \n" // 8 pixels per loop.
- "vrev64.8 q0, q0 \n"
+ "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
+ "subs %3, %3, #8 \n" // 8 pixels per loop.
+ "rev64 v0.8b, v0.8b \n"
+ "rev64 v1.8b, v1.8b \n"
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // dst += 8
+ "st1 {v0.8b}, [%1], #8 \n" // dst += 8
MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n"
+ "st1 {v1.8b}, [%2], #8 \n"
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
- :
- : "cc", "memory", "r12", "q0"
+ : "r"((ptrdiff_t)-16) // %4
+ : "cc", "memory", "v0", "v1"
);
}
#endif // HAS_MIRRORUVROW_NEON
@@ -980,26 +979,25 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// Start at end of source row.
- "mov r3, #-16 \n"
"add %0, %0, %2, lsl #2 \n"
- "sub %0, #16 \n"
+ "sub %0, %0, #16 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #4 \n" // 4 pixels per loop.
- "vrev64.32 q0, q0 \n"
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "rev64 v0.4s, v0.4s \n"
MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // dst += 16
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n"
+ "st1 {v0.D}[0], [%1], #8 \n"
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0"
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0"
);
}
#endif // HAS_ARGBMIRRORROW_NEON
@@ -1007,20 +1005,20 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
#ifdef HAS_RGB24TOARGBROW_NEON
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile (
- "vmov.u8 d4, #255 \n" // Alpha
+ "movi v4.8b, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
#endif // HAS_RGB24TOARGBROW_NEON
@@ -1028,21 +1026,22 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
#ifdef HAS_RAWTOARGBROW_NEON
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile (
- "vmov.u8 d4, #255 \n" // Alpha
+ "movi v5.8b, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
+ "mov v3.8b, v1.8b \n" // move g
+ "mov v4.8b, v0.8b \n" // move r
MEMACCESS(1)
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a
"bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
#endif // HAS_RAWTOARGBROW_NEON
@@ -1170,16 +1169,16 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
+ "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
#endif // HAS_ARGBTORGB24ROW_NEON
@@ -1190,17 +1189,18 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
+ "mov v4.8b, v2.8b \n" // mov g
+ "mov v5.8b, v1.8b \n" // mov b
MEMACCESS(1)
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
#endif // HAS_ARGBTORAWROW_NEON
@@ -1211,16 +1211,16 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
#endif // HAS_YUY2TOYROW_NEON
@@ -1231,16 +1231,16 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
#endif // HAS_UYVYTOYROW_NEON
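
The YUY2/UYVY pairs above differ only in which deinterleaved half holds the luma: YUY2 lays bytes out as Y0 U Y1 V, so after "ld2 {v0.16b, v1.16b}" the Y samples land in v0, while UYVY is U Y0 V Y1, putting Y in v1. In scalar terms the two luma extractions are:

/* Scalar models of the packed-4:2:2 luma extraction kernels above. */
void YUY2ToYRow_scalar(const unsigned char* src_yuy2, unsigned char* dst_y,
                       int pix) {
  int i;
  for (i = 0; i < pix; ++i) dst_y[i] = src_yuy2[2 * i];      /* Y0 U Y1 V */
}
void UYVYToYRow_scalar(const unsigned char* src_uyvy, unsigned char* dst_y,
                       int pix) {
  int i;
  for (i = 0; i < pix; ++i) dst_y[i] = src_uyvy[2 * i + 1];  /* U Y0 V Y1 */
}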
@@ -1252,19 +1252,19 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2)
- "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_YUY2TOUV422ROW_NEON
@@ -1276,19 +1276,19 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2)
- "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_UYVYTOUV422ROW_NEON
@@ -1297,20 +1297,20 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
- "add %1, %0, %1 \n" // stride + src_yuy2
+ "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
- "vrhadd.u8 d1, d1, d5 \n" // average rows of U
- "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2.
+ "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+ "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3)
- "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(stride_yuy2), // %1
@@ -1318,7 +1318,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
);
}
#endif // HAS_YUY2TOUVROW_NEON
@@ -1327,20 +1327,20 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
- "add %1, %0, %1 \n" // stride + src_uyvy
+ "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
- "vrhadd.u8 d0, d0, d4 \n" // average rows of U
- "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY.
+ "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+ "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3)
- "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(stride_uyvy), // %1
@@ -1348,7 +1348,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
);
}
#endif // HAS_UYVYTOUVROW_NEON
@@ -1358,23 +1358,23 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) {
asm volatile (
// change the stride to row 2 pointer
- "add %1, %0 \n"
+ "add %x1, %x0, %w1, sxtw \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels.
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
- "vrhadd.u8 q0, q1 \n" // average row 1 and 2
+ "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels.
+ "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2
MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n"
+ "st1 {v0.16b}, [%2], #16 \n"
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(src_uv_stride), // %1
"+r"(dst_uv), // %2
"+r"(pix) // %3
:
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
#endif // HAS_HALFROW_NEON
@@ -1384,22 +1384,22 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
asm volatile (
- "vmov.u32 d6[0], %3 \n" // selector
+ "mov v2.s[0], %w3 \n" // selector
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
+ "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
- "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
- "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
- "vtrn.u32 d4, d5 \n" // combine 8 pixels
+ "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels
+ "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels
+ "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels
MEMACCESS(1)
- "vst1.8 {d4}, [%1]! \n" // store 8.
+ "st1 {v4.8b}, [%1], #8 \n" // store 8.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
: "r"(selector) // %3
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List
);
}
#endif // HAS_ARGBTOBAYERROW_NEON
@@ -1411,16 +1411,16 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
asm volatile (
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // store 8 G's.
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_ARGBTOBAYERGGROW_NEON
@@ -1431,21 +1431,20 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
MEMACCESS(3)
- "vld1.8 {q2}, [%3] \n" // shuffler
+ "ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop
- "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
- "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store 4.
+ "st1 {v1.16b}, [%1], #16 \n" // store 4.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(shuffler) // %3
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
#endif // HAS_ARGBSHUFFLEROW_NEON
@@ -1459,14 +1458,15 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
+ "mov v2.8b, v1.8b \n"
MEMACCESS(1)
- "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2)
- "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3)
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@@ -1474,7 +1474,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
"+r"(dst_yuy2), // %3
"+r"(width) // %4
:
- : "cc", "memory", "d0", "d1", "d2", "d3"
+ : "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#endif // HAS_I422TOYUY2ROW_NEON
@@ -1488,14 +1488,15 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys
+ "mov v3.8b, v2.8b \n"
MEMACCESS(1)
- "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2)
- "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3)
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@@ -1503,7 +1504,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
"+r"(dst_uyvy), // %3
"+r"(width) // %4
:
- : "cc", "memory", "d0", "d1", "d2", "d3"
+ : "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#endif // HAS_I422TOUYVYROW_NEON
@@ -1577,28 +1578,28 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
#ifdef HAS_ARGBTOYROW_NEON
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGBTOYROW_NEON
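
ARGBToYRow_NEON's constants are BT.601 luma weights in 7-bit fixed point, as the comments say: 13/128 = 0.1016 for B, 65/128 = 0.5078 for G, 33/128 = 0.2578 for R, with "sqrshrun #7" doing the rounded shift back to 8 bits and "uqadd" adding the studio-swing +16 offset with saturation. A scalar model of the same arithmetic:

/* Scalar model of ARGBToYRow: Y = ((13*B + 65*G + 33*R + 64) >> 7) + 16,
 * matching the umull/umlal, sqrshrun #7 and uqadd sequence above. */
void ARGBToYRow_scalar(const unsigned char* src_argb, unsigned char* dst_y,
                       int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    const unsigned char* p = src_argb + 4 * i;  /* bytes are B, G, R, A */
    int y = ((13 * p[0] + 65 * p[1] + 33 * p[2] + 64) >> 7) + 16;
    dst_y[i] = (unsigned char)(y > 255 ? 255 : y);  /* uqadd saturates */
  }
}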
@@ -1606,26 +1607,26 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
#ifdef HAS_ARGBTOYJROW_NEON
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ "movi v4.8b, #15 \n" // B * 0.11400 coefficient
+ "movi v5.8b, #75 \n" // G * 0.58700 coefficient
+ "movi v6.8b, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
#endif // HAS_ARGBTOYJROW_NEON
@@ -3048,20 +3049,20 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
- "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q0, d0, d1 \n" // multiply B
- "vmull.u8 q1, d2, d3 \n" // multiply G
- "vmull.u8 q2, d4, d5 \n" // multiply R
- "vmull.u8 q3, d6, d7 \n" // multiply A
- "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
- "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
- "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
- "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "umull v2.8h, v2.8b, v6.8b \n" // multiply R
+ "umull v3.8h, v3.8b, v7.8b \n" // multiply A
+ "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
+ "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
+ "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
+ "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
@@ -3069,7 +3070,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGBMULTIPLYROW_NEON
@@ -3083,14 +3084,16 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 q0, q0, q2 \n" // add B, G
- "vqadd.u8 q1, q1, q3 \n" // add R, A
+ "uqadd v0.8b, v0.8b, v4.8b \n"
+ "uqadd v1.8b, v1.8b, v5.8b \n"
+ "uqadd v2.8b, v2.8b, v6.8b \n"
+ "uqadd v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
@@ -3098,7 +3101,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGBADDROW_NEON
@@ -3112,14 +3115,16 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vqsub.u8 q0, q0, q2 \n" // subtract B, G
- "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ "uqsub v0.8b, v0.8b, v4.8b \n"
+ "uqsub v1.8b, v1.8b, v5.8b \n"
+ "uqsub v2.8b, v2.8b, v6.8b \n"
+ "uqsub v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
@@ -3127,7 +3132,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGBSUBTRACTROW_NEON
@@ -3141,27 +3146,27 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
asm volatile (
- "vmov.u8 d3, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1)
- "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d0, d0, d1 \n" // add
- "vmov.u8 d1, d0 \n"
- "vmov.u8 d2, d0 \n"
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "mov v1.8b, v0.8b \n"
+ "mov v2.8b, v0.8b \n"
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1"
+ : "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#endif // HAS_SOBELROW_NEON
@@ -3175,20 +3180,20 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop.
- "vqadd.u8 q0, q0, q1 \n" // add
+ "uqadd v0.16b, v0.16b, v1.16b \n" // add
MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1"
+ : "cc", "memory", "v0", "v1"
);
}
#endif // HAS_SOBELTOPLANEROW_NEON
@@ -3202,25 +3207,25 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
asm volatile (
- "vmov.u8 d3, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1)
- "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d1, d0, d2 \n" // add
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1"
+ : "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#endif // HAS_SOBELXYROW_NEON
@@ -3236,28 +3241,28 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d0}, [%0],%5 \n" // top
+ "ld1 {v0.8b}, [%0],%5 \n" // top
MEMACCESS(0)
- "vld1.8 {d1}, [%0],%6 \n"
- "vsubl.u8 q0, d0, d1 \n"
+ "ld1 {v1.8b}, [%0],%6 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
MEMACCESS(1)
- "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ "ld1 {v2.8b}, [%1],%5 \n" // center * 2
MEMACCESS(1)
- "vld1.8 {d3}, [%1],%6 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
+ "ld1 {v3.8b}, [%1],%6 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
MEMACCESS(2)
- "vld1.8 {d2}, [%2],%5 \n" // bottom
+ "ld1 {v2.8b}, [%2],%5 \n" // bottom
MEMACCESS(2)
- "vld1.8 {d3}, [%2],%6 \n"
+ "ld1 {v3.8b}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
MEMACCESS(3)
- "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
"bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@@ -3266,7 +3271,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
"+r"(width) // %4
: "r"(2), // %5
"r"(6) // %6
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_SOBELXROW_NEON
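
SobelXRow_NEON computes, for each output pixel, the classic 3x3 Sobel X response with the absolute value folded in: |(t[i] - t[i+2]) + 2*(m[i] - m[i+2]) + (b[i] - b[i+2])| over the top, middle and bottom source rows, saturated to 8 bits by uqxtn. The alternating post-increments of 2 and 6 (operands %5/%6) step each pointer to the i and i+2 taps while advancing 8 pixels per iteration. A scalar model:

/* Scalar model of SobelXRow: 3x3 Sobel X, |.|, saturate to uint8. */
void SobelXRow_scalar(const unsigned char* y0, const unsigned char* y1,
                      const unsigned char* y2, unsigned char* dst_sobelx,
                      int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
            (y2[i] - y2[i + 2]);
    if (s < 0) s = -s;
    dst_sobelx[i] = (unsigned char)(s > 255 ? 255 : s);  /* uqxtn */
  }
}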
@@ -3282,28 +3287,28 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d0}, [%0],%4 \n" // left
+ "ld1 {v0.8b}, [%0],%4 \n" // left
MEMACCESS(1)
- "vld1.8 {d1}, [%1],%4 \n"
- "vsubl.u8 q0, d0, d1 \n"
+ "ld1 {v1.8b}, [%1],%4 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
MEMACCESS(0)
- "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ "ld1 {v2.8b}, [%0],%4 \n" // center * 2
MEMACCESS(1)
- "vld1.8 {d3}, [%1],%4 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
+ "ld1 {v3.8b}, [%1],%4 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
MEMACCESS(0)
- "vld1.8 {d2}, [%0],%5 \n" // right
+ "ld1 {v2.8b}, [%0],%5 \n" // right
MEMACCESS(1)
- "vld1.8 {d3}, [%1],%5 \n"
+ "ld1 {v3.8b}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
"bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@@ -3311,7 +3316,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
"+r"(width) // %3
: "r"(1), // %4
"r"(6) // %5
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_SOBELYROW_NEON
diff --git a/source/libvpx/third_party/libyuv/source/row_win.cc b/source/libvpx/third_party/libyuv/source/row_win.cc
index 8eb8889..d79c353 100644
--- a/source/libvpx/third_party/libyuv/source/row_win.cc
+++ b/source/libvpx/third_party/libyuv/source/row_win.cc
@@ -10,7 +10,7 @@
#include "libyuv/row.h"
-#if defined (_M_X64)
+#if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
#include <emmintrin.h>
#include <tmmintrin.h> // For _mm_maddubs_epi16
#endif
@@ -21,7 +21,8 @@ extern "C" {
#endif
// This module is for Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ (defined(_M_IX86) || defined(_M_X64))
#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
@@ -78,7 +79,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
-
__m128i xmm0, xmm1, xmm2, xmm3;
const __m128i xmm5 = _mm_set1_epi8(-1);
const __m128i xmm4 = _mm_setzero_si128();
@@ -132,7 +132,6 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
-
__m128i xmm0, xmm1, xmm2, xmm3;
const __m128i xmm5 = _mm_set1_epi8(-1);
const __m128i xmm4 = _mm_setzero_si128();
diff --git a/source/libvpx/third_party/libyuv/source/scale_neon64.cc b/source/libvpx/third_party/libyuv/source/scale_neon64.cc
new file mode 100644
index 0000000..64c7d10
--- /dev/null
+++ b/source/libvpx/third_party/libyuv/source/scale_neon64.cc
@@ -0,0 +1,790 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#ifdef HAS_SCALEROWDOWN2_NEON
+// Read 32x1, throw away even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ MEMACCESS(0)
+ "vld2.8 {q0, q1}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
+ );
+}
+#endif //HAS_SCALEROWDOWN2_NEON
+
+#ifdef HAS_SCALEROWDOWN2_NEON
+// Read 32x2, average down, and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ MEMACCESS(1)
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+#endif //HAS_SCALEROWDOWN2_NEON
+
+#ifdef HAS_SCALEROWDOWN4_NEON
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1", "memory", "cc"
+ );
+}
+#endif //HAS_SCALEROWDOWN4_NEON
+
+#ifdef HAS_SCALEROWDOWN4_NEON
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ const uint8* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8* src_ptr3 = src_ptr + src_stride * 3;
+asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ MEMACCESS(3)
+ "vld1.8 {q1}, [%3]! \n"
+ MEMACCESS(4)
+ "vld1.8 {q2}, [%4]! \n"
+ MEMACCESS(5)
+ "vld1.8 {q3}, [%5]! \n"
+ "subs %2, %2, #4 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ MEMACCESS(1)
+ "vst1.32 {d0[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(src_ptr2), // %4
+ "+r"(src_ptr3) // %5
+ :
+ : "q0", "q1", "q2", "q3", "memory", "cc"
+ );
+}
+#endif //HAS_SCALEROWDOWN4_NEON
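
ScaleRowDown4Box_NEON averages each 4x4 block of source pixels into one output pixel: the vpaddl/vpadal chain accumulates 16-sample sums across the four input rows, and "vrshrn.u32 ... #4" divides by 16 with rounding. (Note that this new scale_neon64.cc still carries 32-bit vld/vst mnemonics; presumably the HAS_SCALEROWDOWN*_NEON macros stay undefined for aarch64 until the bodies are ported, keeping them compiled out.) A scalar model of the box filter:

/* Scalar model of ScaleRowDown4Box: rounded mean of each 4x4 block. */
void ScaleRowDown4Box_scalar(const unsigned char* src_ptr, int src_stride,
                             unsigned char* dst_ptr, int dst_width) {
  int i, r, c;
  for (i = 0; i < dst_width; ++i) {
    int sum = 0;
    for (r = 0; r < 4; ++r) {
      for (c = 0; c < 4; ++c) {
        sum += src_ptr[r * src_stride + 4 * i + c];
      }
    }
    dst_ptr[i] = (unsigned char)((sum + 8) >> 4);  /* /16 with rounding */
  }
}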
+
+#ifdef HAS_SCALEROWDOWN34_NEON
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load every 4th pixel into a different register.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "vmov d2, d3 \n" // order d0, d1, d2
+ MEMACCESS(1)
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "d0", "d1", "d2", "d3", "memory", "cc"
+ );
+}
+#endif //HAS_SCALEROWDOWN34_NEON
+
+#ifdef HAS_SCALEROWDOWN34_NEON
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
+
+ // 3 * line_0 + line_1
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
+
+ // (3 * line_0 + line_1) >> 2
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
+
+ MEMACCESS(1)
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+ );
+}
+#endif //HAS_SCALEROWDOWN34_NEON
+
+#ifdef HAS_SCALEROWDOWN34_NEON
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
+ // average src line 0 with src line 1
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
+
+ MEMACCESS(1)
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
+ );
+}
+#endif //HAS_SCALEROWDOWN34_NEON
+
+#ifdef HAS_SCALEROWDOWN38_NEON
+#define HAS_SCALEROWDOWN38_NEON
+static uvec8 kShuf38 =
+ { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+static uvec8 kShuf38_2 =
+ { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+static vec16 kMult38_Div6 =
+ { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+static vec16 kMult38_Div9 =
+ { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ MEMACCESS(3)
+ "vld1.8 {q3}, [%3] \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
+ "subs %2, %2, #12 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d4}, [%1]! \n"
+ MEMACCESS(1)
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
+ );
+}
+
+#endif //HAS_SCALEROWDOWN38_NEON
+
+#ifdef HAS_SCALEROWDOWN38_NEON
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride * 2;
+
+ asm volatile (
+ MEMACCESS(5)
+ "vld1.16 {q13}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {q14}, [%6] \n"
+ MEMACCESS(7)
+ "vld1.8 {q15}, [%7] \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ MEMACCESS(4)
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
+ "subs %2, %2, #12 \n"
+
+    // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "vqrdmulh.s16 q2, q2, q13 \n"
+ "vmovn.u16 d4, q2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+    // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q15 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ MEMACCESS(1)
+ "vst1.8 {d3}, [%1]! \n"
+ MEMACCESS(1)
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_ptr1) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
+ );
+}
+#endif //HAS_SCALEROWDOWN38_NEON
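
The Div6/Div9 constants look off by a factor of two (65536/12 to divide by 6, 65536/18 to divide by 9) because vqrdmulh.s16 is a doubling high multiply: roughly (2*a*b + 2^15) >> 16, saturated. Multiplying a box sum by 65536/12 through vqrdmulh therefore yields sum/6, as the "multiply by 65536 / n and take the upper 16 bits" comments describe. A small self-checking model of the arithmetic:

#include <assert.h>

/* Model of vqrdmulh.s16: saturating rounding doubling high half of a*b. */
static short qrdmulh_s16(short a, short b) {
  long long p = 2LL * a * b + (1 << 15);  /* double, then round */
  p >>= 16;
  if (p > 32767) p = 32767;    /* saturate; only hits for -32768 * -32768 */
  if (p < -32768) p = -32768;
  return (short)p;
}

int main(void) {
  const short kDiv6 = 65536 / 12;  /* 5461, as in kMult38_Div6 */
  int sum;
  /* A 6-sample sum of 8-bit pixels is at most 6 * 255 = 1530. */
  for (sum = 0; sum <= 1530; ++sum) {
    int got = qrdmulh_s16((short)sum, kDiv6);
    int diff = 6 * got - sum;
    assert(diff >= -3 && diff <= 3);  /* got == sum/6 within rounding */
  }
  return 0;
}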
+
+#ifdef HAS_SCALEROWDOWN38_NEON
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ MEMACCESS(4)
+ "vld1.16 {q13}, [%4] \n"
+ MEMACCESS(5)
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n"
+
+    // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "vqrshrn.u16 d4, q2, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+    // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q13 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ MEMACCESS(1)
+ "vst1.8 {d3}, [%1]! \n"
+ MEMACCESS(1)
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+ );
+}
+#endif //HAS_SCALEROWDOWN38_NEON
+
+#if 0
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ asm volatile (
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ MEMACCESS(0)
+ "vst1.8 {d1[7]}, [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+ );
+}
+#endif // 0
+
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ MEMACCESS(0)
+ "vld2.32 {q0, q1}, [%0]! \n"
+ MEMACCESS(0)
+ "vld2.32 {q2, q3}, [%0]! \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ MEMACCESS(1)
+ "vst1.8 {q3}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+#endif // HAS_SCALEARGBROWDOWN2_NEON
+
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ );
+}
+#endif // HAS_SCALEARGBROWDOWN2_NEON
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx, uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "mov r12, %3, lsl #2 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.32 {d0[0]}, [%0], r12 \n"
+ MEMACCESS(0)
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ MEMACCESS(0)
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ MEMACCESS(0)
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stepx) // %3
+ : "memory", "cc", "r12", "q0"
+ );
+}
+#endif // HAS_SCALEARGBROWDOWNEVEN_NEON
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "mov r12, %4, lsl #2 \n"
+ "add %1, %1, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
+ MEMACCESS(1)
+ "vld1.8 {d1}, [%1], r12 \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0], r12 \n"
+ MEMACCESS(1)
+ "vld1.8 {d3}, [%1], r12 \n"
+ MEMACCESS(0)
+ "vld1.8 {d4}, [%0], r12 \n"
+ MEMACCESS(1)
+ "vld1.8 {d5}, [%1], r12 \n"
+ MEMACCESS(0)
+ "vld1.8 {d6}, [%0], r12 \n"
+ MEMACCESS(1)
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n"
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"(src_stepx) // %4
+ : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+ );
+}
+#endif // HAS_SCALEARGBROWDOWNEVEN_NEON
+#endif // __aarch64__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/libvpx/tools_common.c b/source/libvpx/tools_common.c
index 7cfd066..2ec1711 100644
--- a/source/libvpx/tools_common.c
+++ b/source/libvpx/tools_common.c
@@ -83,7 +83,7 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) {
struct FileTypeDetectionBuffer *detect = &input_ctx->detect;
int plane = 0;
int shortread = 0;
- const int bytespp = (yuv_frame->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
+ const int bytespp = (yuv_frame->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
for (plane = 0; plane < 3; ++plane) {
uint8_t *ptr;
@@ -241,7 +241,8 @@ int vpx_img_read(vpx_image_t *img, FILE *file) {
for (plane = 0; plane < 3; ++plane) {
unsigned char *buf = img->planes[plane];
const int stride = img->stride[plane];
- const int w = vpx_img_plane_width(img, plane);
+ const int w = vpx_img_plane_width(img, plane) *
+ ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
const int h = vpx_img_plane_height(img, plane);
int y;
diff --git a/source/libvpx/tools_common.h b/source/libvpx/tools_common.h
index 558413e..c1f466b 100644
--- a/source/libvpx/tools_common.h
+++ b/source/libvpx/tools_common.h
@@ -103,17 +103,25 @@ struct VpxInputContext {
extern "C" {
#endif
+#if defined(__GNUC__)
+#define VPX_NO_RETURN __attribute__((noreturn))
+#else
+#define VPX_NO_RETURN
+#endif
+
/* Sets a stdio stream into binary mode */
FILE *set_binary_mode(FILE *stream);
-void die(const char *fmt, ...);
-void fatal(const char *fmt, ...);
+void die(const char *fmt, ...) VPX_NO_RETURN;
+void fatal(const char *fmt, ...) VPX_NO_RETURN;
void warn(const char *fmt, ...);
-void die_codec(vpx_codec_ctx_t *ctx, const char *s);
+void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN;
/* The tool including this file must define usage_exit() */
-void usage_exit();
+void usage_exit() VPX_NO_RETURN;
+
+#undef VPX_NO_RETURN
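+/* (VPX_NO_RETURN is a local helper macro; undefining it here keeps it from
+ leaking into other files that include this header.) */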
int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame);
diff --git a/source/libvpx/vp8/common/arm/loopfilter_arm.c b/source/libvpx/vp8/common/arm/loopfilter_arm.c
index f37ca63..5840c2b 100644
--- a/source/libvpx/vp8/common/arm/loopfilter_arm.c
+++ b/source/libvpx/vp8/common/arm/loopfilter_arm.c
@@ -25,22 +25,18 @@ extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
#endif
-#if HAVE_NEON_ASM || HAVE_NEON
+#if HAVE_NEON
typedef void loopfilter_y_neon(unsigned char *src, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh);
typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh,
unsigned char *v);
-#endif
-#if HAVE_NEON_ASM
extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
-#endif
-#if HAVE_NEON
extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
@@ -150,9 +146,7 @@ void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign
if (u_ptr)
vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
}
-#endif
-#if HAVE_NEON_ASM
/* Horizontal B Filtering */
void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
diff --git a/source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
index d77f2ba..9824a31 100644
--- a/source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
+++ b/source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -10,7 +10,7 @@
#include <arm_neon.h>
-static const uint16_t bifilter4_coeff[8][2] = {
+static const uint8_t bifilter4_coeff[8][2] = {
{128, 0},
{112, 16},
{ 96, 32},
@@ -64,8 +64,8 @@ void vp8_bilinear_predict4x4_neon(
q1u8 = vcombine_u8(d2u8, d3u8);
q2u8 = vcombine_u8(d4u8, d5u8);
- d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
- d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
@@ -155,8 +155,8 @@ void vp8_bilinear_predict8x4_neon(
q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q5u8 = vld1q_u8(src_ptr);
- d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
- d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
@@ -245,8 +245,8 @@ void vp8_bilinear_predict8x8_neon(
q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
- d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
- d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
diff --git a/source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
deleted file mode 100644
index a8730aa..0000000
--- a/source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
+++ /dev/null
@@ -1,595 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_build_intra_predictors_mby_neon_func|
- EXPORT |vp8_build_intra_predictors_mby_s_neon_func|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *y_buffer
-; r1 unsigned char *ypred_ptr
-; r2 int y_stride
-; r3 int mode
-; stack int Up
-; stack int Left
-
-|vp8_build_intra_predictors_mby_neon_func| PROC
- push {r4-r8, lr}
- vpush {d8-d15}
-
- cmp r3, #0
- beq case_dc_pred
- cmp r3, #1
- beq case_v_pred
- cmp r3, #2
- beq case_h_pred
- cmp r3, #3
- beq case_tm_pred
-
-case_dc_pred
- ldr r4, [sp, #88] ; Up
- ldr r5, [sp, #92] ; Left
-
- ; Default the DC average to 128
- mov r12, #128
- vdup.u8 q0, r12
-
- ; Zero out running sum
- mov r12, #0
-
- ; compute shift and jump
- adds r7, r4, r5
- beq skip_dc_pred_up_left
-
- ; Load above row, if it exists
- cmp r4, #0
- beq skip_dc_pred_up
-
- sub r6, r0, r2
- vld1.8 {q1}, [r6]
- vpaddl.u8 q2, q1
- vpaddl.u16 q3, q2
- vpaddl.u32 q4, q3
-
- vmov.32 r4, d8[0]
- vmov.32 r6, d9[0]
-
- add r12, r4, r6
-
- ; Move back to integer registers
-
-skip_dc_pred_up
-
- cmp r5, #0
- beq skip_dc_pred_left
-
- sub r0, r0, #1
-
- ; Load left row, if it exists
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0]
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
-skip_dc_pred_left
- add r7, r7, #3 ; Shift
- sub r4, r7, #1
- mov r5, #1
- add r12, r12, r5, lsl r4
- mov r5, r12, lsr r7 ; expected_dc
-
- vdup.u8 q0, r5
-
-skip_dc_pred_up_left
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
-
- vpop {d8-d15}
- pop {r4-r8,pc}
-case_v_pred
- ; Copy down above row
- sub r6, r0, r2
- vld1.8 {q0}, [r6]
-
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vpop {d8-d15}
- pop {r4-r8,pc}
-
-case_h_pred
- ; Load 4x yleft_col
- sub r0, r0, #1
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q1}, [r1]!
- vst1.u8 {q2}, [r1]!
- vst1.u8 {q3}, [r1]!
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q1}, [r1]!
- vst1.u8 {q2}, [r1]!
- vst1.u8 {q3}, [r1]!
-
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q1}, [r1]!
- vst1.u8 {q2}, [r1]!
- vst1.u8 {q3}, [r1]!
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q1}, [r1]!
- vst1.u8 {q2}, [r1]!
- vst1.u8 {q3}, [r1]!
-
- vpop {d8-d15}
- pop {r4-r8,pc}
-
-case_tm_pred
- ; Load yabove_row
- sub r3, r0, r2
- vld1.8 {q8}, [r3]
-
- ; Load ytop_left
- sub r3, r3, #1
- ldrb r7, [r3]
-
- vdup.u16 q7, r7
-
- ; Compute yabove_row - ytop_left
- mov r3, #1
- vdup.u8 q0, r3
-
- vmull.u8 q4, d16, d0
- vmull.u8 q5, d17, d0
-
- vsub.s16 q4, q4, q7
- vsub.s16 q5, q5, q7
-
- ; Load 4x yleft_col
- sub r0, r0, #1
- mov r12, #4
-
-case_tm_pred_loop
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u16 q0, r3
- vdup.u16 q1, r4
- vdup.u16 q2, r5
- vdup.u16 q3, r6
-
- vqadd.s16 q8, q0, q4
- vqadd.s16 q9, q0, q5
-
- vqadd.s16 q10, q1, q4
- vqadd.s16 q11, q1, q5
-
- vqadd.s16 q12, q2, q4
- vqadd.s16 q13, q2, q5
-
- vqadd.s16 q14, q3, q4
- vqadd.s16 q15, q3, q5
-
- vqshrun.s16 d0, q8, #0
- vqshrun.s16 d1, q9, #0
-
- vqshrun.s16 d2, q10, #0
- vqshrun.s16 d3, q11, #0
-
- vqshrun.s16 d4, q12, #0
- vqshrun.s16 d5, q13, #0
-
- vqshrun.s16 d6, q14, #0
- vqshrun.s16 d7, q15, #0
-
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q1}, [r1]!
- vst1.u8 {q2}, [r1]!
- vst1.u8 {q3}, [r1]!
-
- subs r12, r12, #1
- bne case_tm_pred_loop
-
- vpop {d8-d15}
- pop {r4-r8,pc}
-
- ENDP
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; r0 unsigned char *y_buffer
-; r1 unsigned char *ypred_ptr
-; r2 int y_stride
-; r3 int mode
-; stack int Up
-; stack int Left
-
-|vp8_build_intra_predictors_mby_s_neon_func| PROC
- push {r4-r8, lr}
- vpush {d8-d15}
-
- mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
-
- cmp r3, #0
- beq case_dc_pred_s
- cmp r3, #1
- beq case_v_pred_s
- cmp r3, #2
- beq case_h_pred_s
- cmp r3, #3
- beq case_tm_pred_s
-
-case_dc_pred_s
- ldr r4, [sp, #88] ; Up
- ldr r5, [sp, #92] ; Left
-
- ; Default the DC average to 128
- mov r12, #128
- vdup.u8 q0, r12
-
- ; Zero out running sum
- mov r12, #0
-
- ; compute shift and jump
- adds r7, r4, r5
- beq skip_dc_pred_up_left_s
-
- ; Load above row, if it exists
- cmp r4, #0
- beq skip_dc_pred_up_s
-
- sub r6, r0, r2
- vld1.8 {q1}, [r6]
- vpaddl.u8 q2, q1
- vpaddl.u16 q3, q2
- vpaddl.u32 q4, q3
-
- vmov.32 r4, d8[0]
- vmov.32 r6, d9[0]
-
- add r12, r4, r6
-
- ; Move back to integer registers
-
-skip_dc_pred_up_s
-
- cmp r5, #0
- beq skip_dc_pred_left_s
-
- sub r0, r0, #1
-
- ; Load left row, if it exists
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0]
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
-skip_dc_pred_left_s
- add r7, r7, #3 ; Shift
- sub r4, r7, #1
- mov r5, #1
- add r12, r12, r5, lsl r4
- mov r5, r12, lsr r7 ; expected_dc
-
- vdup.u8 q0, r5
-
-skip_dc_pred_up_left_s
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
-
- vpop {d8-d15}
- pop {r4-r8,pc}
-case_v_pred_s
- ; Copy down above row
- sub r6, r0, r2
- vld1.8 {q0}, [r6]
-
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
-
- vpop {d8-d15}
- pop {r4-r8,pc}
-
-case_h_pred_s
- ; Load 4x yleft_col
- sub r0, r0, #1
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q1}, [r1], r2
- vst1.u8 {q2}, [r1], r2
- vst1.u8 {q3}, [r1], r2
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q1}, [r1], r2
- vst1.u8 {q2}, [r1], r2
- vst1.u8 {q3}, [r1], r2
-
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q1}, [r1], r2
- vst1.u8 {q2}, [r1], r2
- vst1.u8 {q3}, [r1], r2
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q1}, [r1], r2
- vst1.u8 {q2}, [r1], r2
- vst1.u8 {q3}, [r1], r2
-
- vpop {d8-d15}
- pop {r4-r8,pc}
-
-case_tm_pred_s
- ; Load yabove_row
- sub r3, r0, r2
- vld1.8 {q8}, [r3]
-
- ; Load ytop_left
- sub r3, r3, #1
- ldrb r7, [r3]
-
- vdup.u16 q7, r7
-
- ; Compute yabove_row - ytop_left
- mov r3, #1
- vdup.u8 q0, r3
-
- vmull.u8 q4, d16, d0
- vmull.u8 q5, d17, d0
-
- vsub.s16 q4, q4, q7
- vsub.s16 q5, q5, q7
-
- ; Load 4x yleft_col
- sub r0, r0, #1
- mov r12, #4
-
-case_tm_pred_loop_s
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u16 q0, r3
- vdup.u16 q1, r4
- vdup.u16 q2, r5
- vdup.u16 q3, r6
-
- vqadd.s16 q8, q0, q4
- vqadd.s16 q9, q0, q5
-
- vqadd.s16 q10, q1, q4
- vqadd.s16 q11, q1, q5
-
- vqadd.s16 q12, q2, q4
- vqadd.s16 q13, q2, q5
-
- vqadd.s16 q14, q3, q4
- vqadd.s16 q15, q3, q5
-
- vqshrun.s16 d0, q8, #0
- vqshrun.s16 d1, q9, #0
-
- vqshrun.s16 d2, q10, #0
- vqshrun.s16 d3, q11, #0
-
- vqshrun.s16 d4, q12, #0
- vqshrun.s16 d5, q13, #0
-
- vqshrun.s16 d6, q14, #0
- vqshrun.s16 d7, q15, #0
-
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q1}, [r1], r2
- vst1.u8 {q2}, [r1], r2
- vst1.u8 {q3}, [r1], r2
-
- subs r12, r12, #1
- bne case_tm_pred_loop_s
-
- vpop {d8-d15}
- pop {r4-r8,pc}
-
- ENDP
-
-
- END
diff --git a/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
deleted file mode 100644
index 3a39210..0000000
--- a/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
+++ /dev/null
@@ -1,81 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |idct_dequant_0_2x_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_0_2x_neon(short *q, short dq,
-; unsigned char *dst, int stride);
-; r0 *q
-; r1 dq
-; r2 *dst
-; r3 stride
-|idct_dequant_0_2x_neon| PROC
- push {r4, r5}
- vpush {d8-d15}
-
- add r12, r2, #4
- vld1.32 {d2[0]}, [r2], r3
- vld1.32 {d8[0]}, [r12], r3
- vld1.32 {d2[1]}, [r2], r3
- vld1.32 {d8[1]}, [r12], r3
- vld1.32 {d4[0]}, [r2], r3
- vld1.32 {d10[0]}, [r12], r3
- vld1.32 {d4[1]}, [r2], r3
- vld1.32 {d10[1]}, [r12], r3
-
- ldrh r12, [r0] ; lo q
- ldrh r4, [r0, #32] ; hi q
- mov r5, #0
- strh r5, [r0]
- strh r5, [r0, #32]
-
- sxth r12, r12 ; lo
- mul r0, r12, r1
- add r0, r0, #4
- asr r0, r0, #3
- vdup.16 q0, r0
- sxth r4, r4 ; hi
- mul r0, r4, r1
- add r0, r0, #4
- asr r0, r0, #3
- vdup.16 q3, r0
-
- vaddw.u8 q1, q0, d2 ; lo
- vaddw.u8 q2, q0, d4
- vaddw.u8 q4, q3, d8 ; hi
- vaddw.u8 q5, q3, d10
-
- sub r2, r2, r3, lsl #2 ; dst - 4*stride
- add r0, r2, #4
-
- vqmovun.s16 d2, q1 ; lo
- vqmovun.s16 d4, q2
- vqmovun.s16 d8, q4 ; hi
- vqmovun.s16 d10, q5
-
- vst1.32 {d2[0]}, [r2], r3 ; lo
- vst1.32 {d8[0]}, [r0], r3 ; hi
- vst1.32 {d2[1]}, [r2], r3
- vst1.32 {d8[1]}, [r0], r3
- vst1.32 {d4[0]}, [r2], r3
- vst1.32 {d10[0]}, [r0], r3
- vst1.32 {d4[1]}, [r2]
- vst1.32 {d10[1]}, [r0]
-
- vpop {d8-d15}
- pop {r4, r5}
- bx lr
-
- ENDP ; |idct_dequant_0_2x_neon|
- END
diff --git a/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
new file mode 100644
index 0000000..967c322
--- /dev/null
+++ b/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void idct_dequant_0_2x_neon(
+ int16_t *q,
+ int16_t dq,
+ unsigned char *dst,
+ int stride) {
+ unsigned char *dst0;
+ int i, a0, a1;
+ int16x8x2_t q2Add;
+ int32x2_t d2s32, d4s32;
+ uint8x8_t d2u8, d4u8;
+ uint16x8_t q1u16, q2u16;
+
+ d2s32 = d4s32 = vdup_n_s32(0);  // initialize before the lane loads below
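+ /* Only the DC coefficient of each 4x4 block is nonzero on this path, so
+ the inverse transform reduces to adding the rounded, dequantized DC,
+ (q * dq + 4) >> 3, to every predictor pixel. */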
+ a0 = ((q[0] * dq) + 4) >> 3;
+ a1 = ((q[16] * dq) + 4) >> 3;
+ q[0] = q[16] = 0;
+ q2Add.val[0] = vdupq_n_s16((int16_t)a0);
+ q2Add.val[1] = vdupq_n_s16((int16_t)a1);
+
+ for (i = 0; i < 2; i++, dst += 4) {
+ dst0 = dst;
+ d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
+ dst0 += stride;
+ d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
+ dst0 += stride;
+ d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
+ dst0 += stride;
+ d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
+
+ q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+ vreinterpret_u8_s32(d2s32));
+ q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+ vreinterpret_u8_s32(d4s32));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+ d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+
+ d2s32 = vreinterpret_s32_u8(d2u8);
+ d4s32 = vreinterpret_s32_u8(d4u8);
+
+ dst0 = dst;
+ vst1_lane_s32((int32_t *)dst0, d2s32, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d2s32, 1);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d4s32, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d4s32, 1);
+ }
+ return;
+}
diff --git a/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
deleted file mode 100644
index 8da0fa0..0000000
--- a/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
+++ /dev/null
@@ -1,199 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |idct_dequant_full_2x_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_full_2x_neon(short *q, short *dq,
-; unsigned char *dst, int stride);
-; r0 *q,
-; r1 *dq,
-; r2 *dst
-; r3 stride
-|idct_dequant_full_2x_neon| PROC
- vpush {d8-d15}
-
- vld1.16 {q0, q1}, [r1] ; dq (same l/r)
- vld1.16 {q2, q3}, [r0] ; l q
- add r0, r0, #32
- vld1.16 {q4, q5}, [r0] ; r q
- add r12, r2, #4
-
- ; interleave the predictors
- vld1.32 {d28[0]}, [r2], r3 ; l pre
- vld1.32 {d28[1]}, [r12], r3 ; r pre
- vld1.32 {d29[0]}, [r2], r3
- vld1.32 {d29[1]}, [r12], r3
- vld1.32 {d30[0]}, [r2], r3
- vld1.32 {d30[1]}, [r12], r3
- vld1.32 {d31[0]}, [r2], r3
- vld1.32 {d31[1]}, [r12]
-
- adr r1, cospi8sqrt2minus1 ; pointer to the first constant
-
- ; dequant: q[i] = q[i] * dq[i]
- vmul.i16 q2, q2, q0
- vmul.i16 q3, q3, q1
- vmul.i16 q4, q4, q0
- vmul.i16 q5, q5, q1
-
- vld1.16 {d0}, [r1]
-
- ; q2: l0r0 q3: l8r8
- ; q4: l4r4 q5: l12r12
- vswp d5, d8
- vswp d7, d10
-
- ; _CONSTANTS_ * 4,12 >> 16
- ; q6: 4 * sinpi : c1/temp1
- ; q7: 12 * sinpi : d1/temp2
- ; q8: 4 * cospi
- ; q9: 12 * cospi
- vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
- vqdmulh.s16 q7, q5, d0[2]
- vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
- vqdmulh.s16 q9, q5, d0[0]
-
- vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
- vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
-
- ; vqdmulh only accepts signed values. this was a problem because
- ; our constant had the high bit set, and was treated as a negative value.
- ; vqdmulh also doubles the value before it shifts by 16. we need to
- ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
- ; so we can shift the constant without losing precision. this avoids
- ; shifting again afterward, but also avoids the sign issue. win win!
- ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
- ; pre-shift it
- vshr.s16 q8, q8, #1
- vshr.s16 q9, q9, #1
-
- ; q4: 4 + 4 * cospi : d1/temp1
- ; q5: 12 + 12 * cospi : c1/temp2
- vqadd.s16 q4, q4, q8
- vqadd.s16 q5, q5, q9
-
- ; c1 = temp1 - temp2
- ; d1 = temp1 + temp2
- vqsub.s16 q2, q6, q5
- vqadd.s16 q3, q4, q7
-
- ; [0]: a1+d1
- ; [1]: b1+c1
- ; [2]: b1-c1
- ; [3]: a1-d1
- vqadd.s16 q4, q10, q3
- vqadd.s16 q5, q11, q2
- vqsub.s16 q6, q11, q2
- vqsub.s16 q7, q10, q3
-
- ; rotate
- vtrn.32 q4, q6
- vtrn.32 q5, q7
- vtrn.16 q4, q5
- vtrn.16 q6, q7
- ; idct loop 2
- ; q4: l 0, 4, 8,12 r 0, 4, 8,12
- ; q5: l 1, 5, 9,13 r 1, 5, 9,13
- ; q6: l 2, 6,10,14 r 2, 6,10,14
- ; q7: l 3, 7,11,15 r 3, 7,11,15
-
- ; q8: 1 * sinpi : c1/temp1
- ; q9: 3 * sinpi : d1/temp2
- ; q10: 1 * cospi
- ; q11: 3 * cospi
- vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
- vqdmulh.s16 q9, q7, d0[2]
- vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
- vqdmulh.s16 q11, q7, d0[0]
-
- vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
- vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
-
- ; see note on shifting above
- vshr.s16 q10, q10, #1
- vshr.s16 q11, q11, #1
-
- ; q10: 1 + 1 * cospi : d1/temp1
- ; q11: 3 + 3 * cospi : c1/temp2
- vqadd.s16 q10, q5, q10
- vqadd.s16 q11, q7, q11
-
- ; q8: c1 = temp1 - temp2
- ; q9: d1 = temp1 + temp2
- vqsub.s16 q8, q8, q11
- vqadd.s16 q9, q10, q9
-
- ; a1+d1
- ; b1+c1
- ; b1-c1
- ; a1-d1
- vqadd.s16 q4, q2, q9
- vqadd.s16 q5, q3, q8
- vqsub.s16 q6, q3, q8
- vqsub.s16 q7, q2, q9
-
- ; +4 >> 3 (rounding)
- vrshr.s16 q4, q4, #3 ; lo
- vrshr.s16 q5, q5, #3
- vrshr.s16 q6, q6, #3 ; hi
- vrshr.s16 q7, q7, #3
-
- vtrn.32 q4, q6
- vtrn.32 q5, q7
- vtrn.16 q4, q5
- vtrn.16 q6, q7
-
- ; adding pre
- ; input is still packed. pre was read interleaved
- vaddw.u8 q4, q4, d28
- vaddw.u8 q5, q5, d29
- vaddw.u8 q6, q6, d30
- vaddw.u8 q7, q7, d31
-
- vmov.i16 q14, #0
- vmov q15, q14
- vst1.16 {q14, q15}, [r0] ; write over high input
- sub r0, r0, #32
- vst1.16 {q14, q15}, [r0] ; write over low input
-
- sub r2, r2, r3, lsl #2 ; dst - 4*stride
- add r1, r2, #4 ; hi
-
- ;saturate and narrow
- vqmovun.s16 d0, q4 ; lo
- vqmovun.s16 d1, q5
- vqmovun.s16 d2, q6 ; hi
- vqmovun.s16 d3, q7
-
- vst1.32 {d0[0]}, [r2], r3 ; lo
- vst1.32 {d0[1]}, [r1], r3 ; hi
- vst1.32 {d1[0]}, [r2], r3
- vst1.32 {d1[1]}, [r1], r3
- vst1.32 {d2[0]}, [r2], r3
- vst1.32 {d2[1]}, [r1], r3
- vst1.32 {d3[0]}, [r2]
- vst1.32 {d3[1]}, [r1]
-
- vpop {d8-d15}
- bx lr
-
- ENDP ; |idct_dequant_full_2x_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b
-; because the lowest bit in 0x8a8c is 0, we can pre-shift this
-sinpi8sqrt2 DCD 0x4546
-
- END
diff --git a/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
new file mode 100644
index 0000000..a60ed46
--- /dev/null
+++ b/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+// sinpi8sqrt2 is 0x8a8c (35468) pre-shifted right by one; because the lowest
+// bit of 0x8a8c is 0, the pre-shift loses no precision.
+static const int16_t sinpi8sqrt2 = 17734;
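+// (Both constants are Q16 fixed point: (cos(pi/8)*sqrt(2) - 1) * 65536 and
+// sin(pi/8)*sqrt(2) * 65536. vqdmulhq_n_s16 doubles the product before
+// taking the high half; the pre-shift here and the vshrq_n_s16(..., 1)
+// below cancel that doubling.)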
+
+void idct_dequant_full_2x_neon(
+ int16_t *q,
+ int16_t *dq,
+ unsigned char *dst,
+ int stride) {
+ unsigned char *dst0, *dst1;
+ int32x2_t d28, d29, d30, d31;
+ int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+ int16x8_t qEmpty = vdupq_n_s16(0);
+ int32x4x2_t q2tmp0, q2tmp1;
+ int16x8x2_t q2tmp2, q2tmp3;
+ int16x4_t dLow0, dLow1, dHigh0, dHigh1;
+
+ d28 = d29 = d30 = d31 = vdup_n_s32(0);
+
+ // load dq
+ q0 = vld1q_s16(dq);
+ dq += 8;
+ q1 = vld1q_s16(dq);
+
+ // load q
+ q2 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+ q += 8;
+ q3 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+ q += 8;
+ q4 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+ q += 8;
+ q5 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+
+ // load src from dst
+ dst0 = dst;
+ dst1 = dst + 4;
+ d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
+ dst0 += stride;
+ d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
+ dst1 += stride;
+ d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
+ dst0 += stride;
+ d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
+ dst1 += stride;
+
+ d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
+ dst0 += stride;
+ d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
+ dst1 += stride;
+ d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
+ d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
+
+ q2 = vmulq_s16(q2, q0);
+ q3 = vmulq_s16(q3, q1);
+ q4 = vmulq_s16(q4, q0);
+ q5 = vmulq_s16(q5, q1);
+
+ // vswp
+ dLow0 = vget_low_s16(q2);
+ dHigh0 = vget_high_s16(q2);
+ dLow1 = vget_low_s16(q4);
+ dHigh1 = vget_high_s16(q4);
+ q2 = vcombine_s16(dLow0, dLow1);
+ q4 = vcombine_s16(dHigh0, dHigh1);
+
+ dLow0 = vget_low_s16(q3);
+ dHigh0 = vget_high_s16(q3);
+ dLow1 = vget_low_s16(q5);
+ dHigh1 = vget_high_s16(q5);
+ q3 = vcombine_s16(dLow0, dLow1);
+ q5 = vcombine_s16(dHigh0, dHigh1);
+
+ q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
+ q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
+ q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
+ q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
+
+ q10 = vqaddq_s16(q2, q3);
+ q11 = vqsubq_s16(q2, q3);
+
+ q8 = vshrq_n_s16(q8, 1);
+ q9 = vshrq_n_s16(q9, 1);
+
+ q4 = vqaddq_s16(q4, q8);
+ q5 = vqaddq_s16(q5, q9);
+
+ q2 = vqsubq_s16(q6, q5);
+ q3 = vqaddq_s16(q7, q4);
+
+ q4 = vqaddq_s16(q10, q3);
+ q5 = vqaddq_s16(q11, q2);
+ q6 = vqsubq_s16(q11, q2);
+ q7 = vqsubq_s16(q10, q3);
+
+ q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+ q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+ q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+ vreinterpretq_s16_s32(q2tmp1.val[0]));
+ q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+ vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+ // loop 2
+ q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
+ q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
+ q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
+ q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
+
+ q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+ q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+
+ q10 = vshrq_n_s16(q10, 1);
+ q11 = vshrq_n_s16(q11, 1);
+
+ q10 = vqaddq_s16(q2tmp2.val[1], q10);
+ q11 = vqaddq_s16(q2tmp3.val[1], q11);
+
+ q8 = vqsubq_s16(q8, q11);
+ q9 = vqaddq_s16(q9, q10);
+
+ q4 = vqaddq_s16(q2, q9);
+ q5 = vqaddq_s16(q3, q8);
+ q6 = vqsubq_s16(q3, q8);
+ q7 = vqsubq_s16(q2, q9);
+
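+ // (x + 4) >> 3: the final rounding shift of the inverse transform.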
+ q4 = vrshrq_n_s16(q4, 3);
+ q5 = vrshrq_n_s16(q5, 3);
+ q6 = vrshrq_n_s16(q6, 3);
+ q7 = vrshrq_n_s16(q7, 3);
+
+ q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+ q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+ q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+ vreinterpretq_s16_s32(q2tmp1.val[0]));
+ q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+ vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+ q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
+ vreinterpret_u8_s32(d28)));
+ q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
+ vreinterpret_u8_s32(d29)));
+ q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
+ vreinterpret_u8_s32(d30)));
+ q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
+ vreinterpret_u8_s32(d31)));
+
+ d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
+ d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
+ d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
+ d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
+
+ dst0 = dst;
+ dst1 = dst + 4;
+ vst1_lane_s32((int32_t *)dst0, d28, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst1, d28, 1);
+ dst1 += stride;
+ vst1_lane_s32((int32_t *)dst0, d29, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst1, d29, 1);
+ dst1 += stride;
+
+ vst1_lane_s32((int32_t *)dst0, d30, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst1, d30, 1);
+ dst1 += stride;
+ vst1_lane_s32((int32_t *)dst0, d31, 0);
+ vst1_lane_s32((int32_t *)dst1, d31, 1);
+ return;
+}
diff --git a/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm b/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm
deleted file mode 100644
index c4f09c7..0000000
--- a/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm
+++ /dev/null
@@ -1,409 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_loop_filter_horizontal_edge_y_neon|
- EXPORT |vp8_loop_filter_horizontal_edge_uv_neon|
- EXPORT |vp8_loop_filter_vertical_edge_y_neon|
- EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
- ARM
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src
-; r1 int pitch
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-|vp8_loop_filter_horizontal_edge_y_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- vdup.u8 q0, r2 ; duplicate blimit
- vdup.u8 q1, r3 ; duplicate limit
- sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
- ldr r3, [sp, #68] ; load thresh
- add r12, r2, r1
- add r1, r1, r1
-
- vdup.u8 q2, r3 ; duplicate thresh
-
- vld1.u8 {q3}, [r2@128], r1 ; p3
- vld1.u8 {q4}, [r12@128], r1 ; p2
- vld1.u8 {q5}, [r2@128], r1 ; p1
- vld1.u8 {q6}, [r12@128], r1 ; p0
- vld1.u8 {q7}, [r2@128], r1 ; q0
- vld1.u8 {q8}, [r12@128], r1 ; q1
- vld1.u8 {q9}, [r2@128] ; q2
- vld1.u8 {q10}, [r12@128] ; q3
-
- sub r2, r2, r1, lsl #1
- sub r12, r12, r1, lsl #1
-
- bl vp8_loop_filter_neon
-
- vst1.u8 {q5}, [r2@128], r1 ; store op1
- vst1.u8 {q6}, [r12@128], r1 ; store op0
- vst1.u8 {q7}, [r2@128], r1 ; store oq0
- vst1.u8 {q8}, [r12@128], r1 ; store oq1
-
- vpop {d8-d15}
- pop {pc}
- ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
-
-
-; r0 unsigned char *u,
-; r1 int pitch,
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-; sp+4 unsigned char *v
-|vp8_loop_filter_horizontal_edge_uv_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- vdup.u8 q0, r2 ; duplicate blimit
- vdup.u8 q1, r3 ; duplicate limit
- ldr r12, [sp, #68] ; load thresh
- ldr r2, [sp, #72] ; load v ptr
- vdup.u8 q2, r12 ; duplicate thresh
-
- sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
- sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
-
- vld1.u8 {d6}, [r3@64], r1 ; p3
- vld1.u8 {d7}, [r12@64], r1 ; p3
- vld1.u8 {d8}, [r3@64], r1 ; p2
- vld1.u8 {d9}, [r12@64], r1 ; p2
- vld1.u8 {d10}, [r3@64], r1 ; p1
- vld1.u8 {d11}, [r12@64], r1 ; p1
- vld1.u8 {d12}, [r3@64], r1 ; p0
- vld1.u8 {d13}, [r12@64], r1 ; p0
- vld1.u8 {d14}, [r3@64], r1 ; q0
- vld1.u8 {d15}, [r12@64], r1 ; q0
- vld1.u8 {d16}, [r3@64], r1 ; q1
- vld1.u8 {d17}, [r12@64], r1 ; q1
- vld1.u8 {d18}, [r3@64], r1 ; q2
- vld1.u8 {d19}, [r12@64], r1 ; q2
- vld1.u8 {d20}, [r3@64] ; q3
- vld1.u8 {d21}, [r12@64] ; q3
-
- bl vp8_loop_filter_neon
-
- sub r0, r0, r1, lsl #1
- sub r2, r2, r1, lsl #1
-
- vst1.u8 {d10}, [r0@64], r1 ; store u op1
- vst1.u8 {d11}, [r2@64], r1 ; store v op1
- vst1.u8 {d12}, [r0@64], r1 ; store u op0
- vst1.u8 {d13}, [r2@64], r1 ; store v op0
- vst1.u8 {d14}, [r0@64], r1 ; store u oq0
- vst1.u8 {d15}, [r2@64], r1 ; store v oq0
- vst1.u8 {d16}, [r0@64] ; store u oq1
- vst1.u8 {d17}, [r2@64] ; store v oq1
-
- vpop {d8-d15}
- pop {pc}
- ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
-
-; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-; const signed char *flimit,
-; const signed char *limit,
-; const signed char *thresh,
-; int count)
-; r0 unsigned char *src
-; r1 int pitch
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-
-|vp8_loop_filter_vertical_edge_y_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- vdup.u8 q0, r2 ; duplicate blimit
- vdup.u8 q1, r3 ; duplicate limit
- sub r2, r0, #4 ; src ptr down by 4 columns
- add r1, r1, r1
- ldr r3, [sp, #68] ; load thresh
- add r12, r2, r1, asr #1
-
- vld1.u8 {d6}, [r2], r1
- vld1.u8 {d8}, [r12], r1
- vld1.u8 {d10}, [r2], r1
- vld1.u8 {d12}, [r12], r1
- vld1.u8 {d14}, [r2], r1
- vld1.u8 {d16}, [r12], r1
- vld1.u8 {d18}, [r2], r1
- vld1.u8 {d20}, [r12], r1
-
- vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
- vld1.u8 {d9}, [r12], r1
- vld1.u8 {d11}, [r2], r1
- vld1.u8 {d13}, [r12], r1
- vld1.u8 {d15}, [r2], r1
- vld1.u8 {d17}, [r12], r1
- vld1.u8 {d19}, [r2]
- vld1.u8 {d21}, [r12]
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vdup.u8 q2, r3 ; duplicate thresh
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- bl vp8_loop_filter_neon
-
- vswp d12, d11
- vswp d16, d13
-
- sub r0, r0, #2 ; dst ptr
-
- vswp d14, d12
- vswp d16, d15
-
- add r12, r0, r1, asr #1
-
- ;store op1, op0, oq0, oq1
- vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
- vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
- vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
- vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
- vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
- vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
- vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
- vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
-
- vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
- vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
- vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
- vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
- vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
- vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
- vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
- vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
-
- vpop {d8-d15}
- pop {pc}
- ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
-
-; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
-; const signed char *flimit,
-; const signed char *limit,
-; const signed char *thresh,
-; unsigned char *v)
-; r0 unsigned char *u,
-; r1 int pitch,
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-; sp+4 unsigned char *v
-|vp8_loop_filter_vertical_edge_uv_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- vdup.u8 q0, r2 ; duplicate blimit
- sub r12, r0, #4 ; move u pointer down by 4 columns
- ldr r2, [sp, #72] ; load v ptr
- vdup.u8 q1, r3 ; duplicate limit
- sub r3, r2, #4 ; move v pointer down by 4 columns
-
- vld1.u8 {d6}, [r12], r1 ;load u data
- vld1.u8 {d7}, [r3], r1 ;load v data
- vld1.u8 {d8}, [r12], r1
- vld1.u8 {d9}, [r3], r1
- vld1.u8 {d10}, [r12], r1
- vld1.u8 {d11}, [r3], r1
- vld1.u8 {d12}, [r12], r1
- vld1.u8 {d13}, [r3], r1
- vld1.u8 {d14}, [r12], r1
- vld1.u8 {d15}, [r3], r1
- vld1.u8 {d16}, [r12], r1
- vld1.u8 {d17}, [r3], r1
- vld1.u8 {d18}, [r12], r1
- vld1.u8 {d19}, [r3], r1
- vld1.u8 {d20}, [r12]
- vld1.u8 {d21}, [r3]
-
- ldr r12, [sp, #68] ; load thresh
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vdup.u8 q2, r12 ; duplicate thresh
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- bl vp8_loop_filter_neon
-
- vswp d12, d11
- vswp d16, d13
- vswp d14, d12
- vswp d16, d15
-
- sub r0, r0, #2
- sub r2, r2, #2
-
- ;store op1, op0, oq0, oq1
- vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
- vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
- vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
- vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
- vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
- vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
- vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
- vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
- vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
- vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
- vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
- vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
- vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
- vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
- vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
- vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
-
- vpop {d8-d15}
- pop {pc}
- ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
-
-; void vp8_loop_filter_neon();
-; This is a helper function for the loopfilters. The individual functions do the
-; necessary load, transpose (if necessary) and store.
-
-; r0-r3 PRESERVE
-; q0 flimit
-; q1 limit
-; q2 thresh
-; q3 p3
-; q4 p2
-; q5 p1
-; q6 p0
-; q7 q0
-; q8 q1
-; q9 q2
-; q10 q3
-|vp8_loop_filter_neon| PROC
-
- ; vp8_filter_mask
- vabd.u8 q11, q3, q4 ; abs(p3 - p2)
- vabd.u8 q12, q4, q5 ; abs(p2 - p1)
- vabd.u8 q13, q5, q6 ; abs(p1 - p0)
- vabd.u8 q14, q8, q7 ; abs(q1 - q0)
- vabd.u8 q3, q9, q8 ; abs(q2 - q1)
- vabd.u8 q4, q10, q9 ; abs(q3 - q2)
-
- vmax.u8 q11, q11, q12
- vmax.u8 q12, q13, q14
- vmax.u8 q3, q3, q4
- vmax.u8 q15, q11, q12
-
- vabd.u8 q9, q6, q7 ; abs(p0 - q0)
-
- ; vp8_hevmask
- vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
- vmax.u8 q15, q15, q3
-
- vmov.u8 q10, #0x80 ; 0x80
-
- vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
- vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
-
- vcge.u8 q15, q1, q15
-
- ; vp8_filter() function
- ; convert to signed
- veor q7, q7, q10 ; qs0
- vshr.u8 q2, q2, #1 ; a = a / 2
- veor q6, q6, q10 ; ps0
-
- veor q5, q5, q10 ; ps1
- vqadd.u8 q9, q9, q2 ; a = b + a
-
- veor q8, q8, q10 ; qs1
-
- vmov.u8 q10, #3 ; #3
-
- vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
- vsubl.s8 q11, d15, d13
-
- vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
-
- vmovl.u8 q4, d20
-
- vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
- vorr q14, q13, q14 ; vp8_hevmask
-
- vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
- vmul.i16 q11, q11, q4
-
- vand q1, q1, q14 ; vp8_filter &= hev
- vand q15, q15, q9 ; vp8_filter_mask
-
- vaddw.s8 q2, q2, d2
- vaddw.s8 q11, q11, d3
-
- vmov.u8 q9, #4 ; #4
-
- ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d2, q2
- vqmovn.s16 d3, q11
- vand q1, q1, q15 ; vp8_filter &= mask
-
- vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp8_filter+3)
- vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp8_filter+4)
- vshr.s8 q2, q2, #3 ; Filter2 >>= 3
- vshr.s8 q1, q1, #3 ; Filter1 >>= 3
-
-
- vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
- vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
-
- ; outer tap adjustments: ++vp8_filter >> 1
- vrshr.s8 q1, q1, #1
- vbic q1, q1, q14 ; vp8_filter &= ~hev
- vmov.u8 q0, #0x80 ; 0x80
- vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter)
- vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter)
-
- veor q6, q11, q0 ; *op0 = u^0x80
- veor q7, q10, q0 ; *oq0 = u^0x80
- veor q5, q13, q0 ; *op1 = u^0x80
- veor q8, q12, q0 ; *oq1 = u^0x80
-
- bx lr
- ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
-
-;-----------------
-
- END
diff --git a/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c b/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c
new file mode 100644
index 0000000..0bec7fb
--- /dev/null
+++ b/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c
@@ -0,0 +1,549 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_loop_filter_neon(
+ uint8x16_t qblimit, // flimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p3
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q5r, // p1
+ uint8x16_t *q6r, // p0
+ uint8x16_t *q7r, // q0
+ uint8x16_t *q8r) { // q1
+ uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q2s16, q11s16;
+ uint16x8_t q4u16;
+ int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
+ int8x8_t d2s8, d3s8;
+
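+ // vp8_filter_mask: absolute differences between neighboring pixels,
+ // reduced below to a per-pixel maximum that is compared against limit.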
+ q11u8 = vabdq_u8(q3, q4);
+ q12u8 = vabdq_u8(q4, q5);
+ q13u8 = vabdq_u8(q5, q6);
+ q14u8 = vabdq_u8(q8, q7);
+ q3 = vabdq_u8(q9, q8);
+ q4 = vabdq_u8(q10, q9);
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q3 = vmaxq_u8(q3, q4);
+ q15u8 = vmaxq_u8(q11u8, q12u8);
+
+ q9 = vabdq_u8(q6, q7);
+
+ // vp8_hevmask
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q3);
+
+ q2u8 = vabdq_u8(q5, q8);
+ q9 = vqaddq_u8(q9, q9);
+
+ q15u8 = vcgeq_u8(qlimit, q15u8);
+
+ // vp8_filter() function
+ // convert to signed
+ q10 = vdupq_n_u8(0x80);
+ q8 = veorq_u8(q8, q10);
+ q7 = veorq_u8(q7, q10);
+ q6 = veorq_u8(q6, q10);
+ q5 = veorq_u8(q5, q10);
+
+ q2u8 = vshrq_n_u8(q2u8, 1);
+ q9 = vqaddq_u8(q9, q2u8);
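+ // q9 = saturating 2 * abs(p0 - q0) + abs(p1 - q1) / 2; compared against
+ // blimit below.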
+
+ q10 = vdupq_n_u8(3);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6)));
+ q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+ q9 = vcgeq_u8(qblimit, q9);
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+ vreinterpretq_s8_u8(q8));
+
+ q14u8 = vorrq_u8(q13u8, q14u8);
+
+ q4u16 = vmovl_u8(vget_low_u8(q10));
+ q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+ q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+ q15u8 = vandq_u8(q15u8, q9);
+
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+ q9 = vdupq_n_u8(4);
+ // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ d2s8 = vqmovn_s16(q2s16);
+ d3s8 = vqmovn_s16(q11s16);
+ q1s8 = vcombine_s8(d2s8, d3s8);
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+
+ q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
+ q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q1s8 = vshrq_n_s8(q1s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+ q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+ q1s8 = vrshrq_n_s8(q1s8, 1);
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+ q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+ q0u8 = vdupq_n_u8(0x80);
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
+ return;
+}
+
+void vp8_loop_filter_horizontal_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+ src -= (pitch << 2);
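+ // back up four rows so q3..q10 load p3..q3 on either side of the edge.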
+
+ q3 = vld1q_u8(src);
+ src += pitch;
+ q4 = vld1q_u8(src);
+ src += pitch;
+ q5 = vld1q_u8(src);
+ src += pitch;
+ q6 = vld1q_u8(src);
+ src += pitch;
+ q7 = vld1q_u8(src);
+ src += pitch;
+ q8 = vld1q_u8(src);
+ src += pitch;
+ q9 = vld1q_u8(src);
+ src += pitch;
+ q10 = vld1q_u8(src);
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ src -= (pitch * 5);
+ vst1q_u8(src, q5);
+ src += pitch;
+ vst1q_u8(src, q6);
+ src += pitch;
+ vst1q_u8(src, q7);
+ src += pitch;
+ vst1q_u8(src, q8);
+ return;
+}
+
+void vp8_loop_filter_horizontal_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ u -= (pitch << 2);
+ v -= (pitch << 2);
+
+ d6 = vld1_u8(u);
+ u += pitch;
+ d7 = vld1_u8(v);
+ v += pitch;
+ d8 = vld1_u8(u);
+ u += pitch;
+ d9 = vld1_u8(v);
+ v += pitch;
+ d10 = vld1_u8(u);
+ u += pitch;
+ d11 = vld1_u8(v);
+ v += pitch;
+ d12 = vld1_u8(u);
+ u += pitch;
+ d13 = vld1_u8(v);
+ v += pitch;
+ d14 = vld1_u8(u);
+ u += pitch;
+ d15 = vld1_u8(v);
+ v += pitch;
+ d16 = vld1_u8(u);
+ u += pitch;
+ d17 = vld1_u8(v);
+ v += pitch;
+ d18 = vld1_u8(u);
+ u += pitch;
+ d19 = vld1_u8(v);
+ v += pitch;
+ d20 = vld1_u8(u);
+ d21 = vld1_u8(v);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ u -= (pitch * 5);
+ vst1_u8(u, vget_low_u8(q5));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q6));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q7));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q8));
+
+ v -= (pitch * 5);
+ vst1_u8(v, vget_high_u8(q5));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q6));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q7));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q8));
+ return;
+}
+
+static INLINE void write_4x8(unsigned char *dst, int pitch,
+ const uint8x8x4_t result) {
+#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+ vst4_lane_u8(dst, result, 0);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 1);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 2);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 3);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 4);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 5);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 6);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 7);
+#else
+ /*
+ * uint8x8x4_t result
+ 00 01 02 03 | 04 05 06 07
+ 10 11 12 13 | 14 15 16 17
+ 20 21 22 23 | 24 25 26 27
+ 30 31 32 33 | 34 35 36 37
+ ---
+ * after vtrn_u16
+ 00 01 20 21 | 04 05 24 25
+ 02 03 22 23 | 06 07 26 27
+ 10 11 30 31 | 14 15 34 35
+ 12 13 32 33 | 16 17 36 37
+ ---
+ * after vtrn_u8
+ 00 10 20 30 | 04 14 24 34
+ 01 11 21 31 | 05 15 25 35
+ 02 12 22 32 | 06 16 26 36
+ 03 13 23 33 | 07 17 27 37
+ */
+ const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
+ vreinterpret_u16_u8(result.val[2]));
+ const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
+ vreinterpret_u16_u8(result.val[3]));
+ const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+ vreinterpret_u8_u16(r13_u16.val[0]));
+ const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+ vreinterpret_u8_u16(r13_u16.val[1]));
+ const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
+ const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
+ const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
+ const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
+ vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
+#endif
+}
+
+void vp8_loop_filter_vertical_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ unsigned char *s, *d;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+ uint8x8x4_t q4ResultH, q4ResultL;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ s = src - 4;
+ d6 = vld1_u8(s);
+ s += pitch;
+ d8 = vld1_u8(s);
+ s += pitch;
+ d10 = vld1_u8(s);
+ s += pitch;
+ d12 = vld1_u8(s);
+ s += pitch;
+ d14 = vld1_u8(s);
+ s += pitch;
+ d16 = vld1_u8(s);
+ s += pitch;
+ d18 = vld1_u8(s);
+ s += pitch;
+ d20 = vld1_u8(s);
+ s += pitch;
+ d7 = vld1_u8(s);
+ s += pitch;
+ d9 = vld1_u8(s);
+ s += pitch;
+ d11 = vld1_u8(s);
+ s += pitch;
+ d13 = vld1_u8(s);
+ s += pitch;
+ d15 = vld1_u8(s);
+ s += pitch;
+ d17 = vld1_u8(s);
+ s += pitch;
+ d19 = vld1_u8(s);
+ s += pitch;
+ d21 = vld1_u8(s);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ q4ResultL.val[0] = vget_low_u8(q5); // d10
+ q4ResultL.val[1] = vget_low_u8(q6); // d12
+ q4ResultL.val[2] = vget_low_u8(q7); // d14
+ q4ResultL.val[3] = vget_low_u8(q8); // d16
+ q4ResultH.val[0] = vget_high_u8(q5); // d11
+ q4ResultH.val[1] = vget_high_u8(q6); // d13
+ q4ResultH.val[2] = vget_high_u8(q7); // d15
+ q4ResultH.val[3] = vget_high_u8(q8); // d17
+
+ d = src - 2;
+ write_4x8(d, pitch, q4ResultL);
+ d += pitch * 8;
+ write_4x8(d, pitch, q4ResultH);
+}
+
+void vp8_loop_filter_vertical_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ unsigned char *us, *ud;
+ unsigned char *vs, *vd;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+ uint8x8x4_t q4ResultH, q4ResultL;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ us = u - 4;
+ d6 = vld1_u8(us);
+ us += pitch;
+ d8 = vld1_u8(us);
+ us += pitch;
+ d10 = vld1_u8(us);
+ us += pitch;
+ d12 = vld1_u8(us);
+ us += pitch;
+ d14 = vld1_u8(us);
+ us += pitch;
+ d16 = vld1_u8(us);
+ us += pitch;
+ d18 = vld1_u8(us);
+ us += pitch;
+ d20 = vld1_u8(us);
+
+ vs = v - 4;
+ d7 = vld1_u8(vs);
+ vs += pitch;
+ d9 = vld1_u8(vs);
+ vs += pitch;
+ d11 = vld1_u8(vs);
+ vs += pitch;
+ d13 = vld1_u8(vs);
+ vs += pitch;
+ d15 = vld1_u8(vs);
+ vs += pitch;
+ d17 = vld1_u8(vs);
+ vs += pitch;
+ d19 = vld1_u8(vs);
+ vs += pitch;
+ d21 = vld1_u8(vs);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ q4ResultL.val[0] = vget_low_u8(q5); // d10
+ q4ResultL.val[1] = vget_low_u8(q6); // d12
+ q4ResultL.val[2] = vget_low_u8(q7); // d14
+ q4ResultL.val[3] = vget_low_u8(q8); // d16
+ ud = u - 2;
+ write_4x8(ud, pitch, q4ResultL);
+
+ q4ResultH.val[0] = vget_high_u8(q5); // d11
+ q4ResultH.val[1] = vget_high_u8(q6); // d13
+ q4ResultH.val[2] = vget_high_u8(q7); // d15
+ q4ResultH.val[3] = vget_high_u8(q8); // d17
+ vd = v - 2;
+ write_4x8(vd, pitch, q4ResultH);
+}
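+
+/* Illustrative sketch, not upstream code: the vtrnq_u32/u16/u8 cascades in
+ * the two functions above amount to a byte transpose, turning 16 pixel rows
+ * into 8 vectors that each hold one column position (p3..p0, q0..q3) across
+ * all 16 rows. A plain C reference of the same data movement: */
+static void transpose_16x8_ref(const unsigned char in[16][8],
+                               unsigned char out[8][16]) {
+  int r, c;
+  for (r = 0; r < 16; ++r)
+    for (c = 0; c < 8; ++c)
+      out[c][r] = in[r][c];
+}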
diff --git a/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
deleted file mode 100644
index 78d13c8..0000000
--- a/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ /dev/null
@@ -1,156 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_loop_filter_bvs_neon|
- EXPORT |vp8_loop_filter_mbvs_neon|
- ARM
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *s, PRESERVE
-; r1 int p, PRESERVE
-; q1 limit, PRESERVE
-
-|vp8_loop_filter_simple_vertical_edge_neon| PROC
- vpush {d8-d15}
-
- sub r0, r0, #2 ; move src pointer down by 2 columns
- add r12, r1, r1
- add r3, r0, r1
-
- vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
- vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
- vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
- vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
- vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
- vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
- vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
- vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
-
- vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
- vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
- vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
- vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
- vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
- vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
- vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
- vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3]
-
- vswp d7, d10
- vswp d12, d9
-
- ;vp8_filter_mask() function
- ;vp8_hevmask() function
- sub r0, r0, r1, lsl #4
- vabd.u8 q15, q5, q4 ; abs(p0 - q0)
- vabd.u8 q14, q3, q6 ; abs(p1 - q1)
-
- vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
- vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
- vmov.u8 q0, #0x80 ; 0x80
- vmov.s16 q11, #3
- vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
-
- veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value
- veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value
- veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value
- veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value
-
- vcge.u8 q15, q1, q15 ; mask: abs(p0 - q0)*2 + abs(p1 - q1)/2 <= blimit
-
- vsubl.s8 q2, d8, d10 ; ( qs0 - ps0)
- vsubl.s8 q13, d9, d11
-
- vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
-
- vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0)
- vmul.s16 q13, q13, q11
-
- vmov.u8 q11, #0x03 ; 0x03
- vmov.u8 q12, #0x04 ; 0x04
-
- vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q13, q13, d29
-
- vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d29, q13
-
- add r0, r0, #1
- add r3, r0, r1
-
- vand q14, q14, q15 ; vp8_filter &= mask
-
- vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
- vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
- vshr.s8 q2, q2, #3 ; Filter2 >>= 3
- vshr.s8 q14, q3, #3 ; Filter1 >>= 3
-
- ;calculate output
- vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
- vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1)
-
- veor q6, q11, q0 ; *op0 = u^0x80
- veor q7, q10, q0 ; *oq0 = u^0x80
- add r12, r1, r1
- vswp d13, d14
-
- ;store op1, op0, oq0, oq1
- vst2.8 {d12[0], d13[0]}, [r0], r12
- vst2.8 {d12[1], d13[1]}, [r3], r12
- vst2.8 {d12[2], d13[2]}, [r0], r12
- vst2.8 {d12[3], d13[3]}, [r3], r12
- vst2.8 {d12[4], d13[4]}, [r0], r12
- vst2.8 {d12[5], d13[5]}, [r3], r12
- vst2.8 {d12[6], d13[6]}, [r0], r12
- vst2.8 {d12[7], d13[7]}, [r3], r12
- vst2.8 {d14[0], d15[0]}, [r0], r12
- vst2.8 {d14[1], d15[1]}, [r3], r12
- vst2.8 {d14[2], d15[2]}, [r0], r12
- vst2.8 {d14[3], d15[3]}, [r3], r12
- vst2.8 {d14[4], d15[4]}, [r0], r12
- vst2.8 {d14[5], d15[5]}, [r3], r12
- vst2.8 {d14[6], d15[6]}, [r0], r12
- vst2.8 {d14[7], d15[7]}, [r3]
-
- vpop {d8-d15}
- bx lr
- ENDP ; |vp8_loop_filter_simple_vertical_edge_neon|
-
-; r0 unsigned char *y
-; r1 int ystride
-; r2 const unsigned char *blimit
-
-|vp8_loop_filter_bvs_neon| PROC
- push {r4, lr}
- ldrb r3, [r2] ; load blim from mem
- mov r4, r0
- add r0, r0, #4
- vdup.s8 q1, r3 ; duplicate blim
- bl vp8_loop_filter_simple_vertical_edge_neon
- ; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1
- add r0, r4, #8
- bl vp8_loop_filter_simple_vertical_edge_neon
- add r0, r4, #12
- pop {r4, lr}
- b vp8_loop_filter_simple_vertical_edge_neon
- ENDP ;|vp8_loop_filter_bvs_neon|
-
-; r0 unsigned char *y
-; r1 int ystride
-; r2 const unsigned char *blimit
-
-|vp8_loop_filter_mbvs_neon| PROC
- ldrb r3, [r2] ; load mblim from mem
- vdup.s8 q1, r3 ; duplicate mblim
- b vp8_loop_filter_simple_vertical_edge_neon
- ENDP ;|vp8_loop_filter_mbvs_neon|
- END
diff --git a/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
new file mode 100644
index 0000000..d5178bb
--- /dev/null
+++ b/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+ const uint8x8x2_t result,
+ const uint8x8x2_t result2) {
+ vst2_lane_u8(dst, result, 0);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 1);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 2);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 3);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 4);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 5);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 6);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 7);
+ dst += pitch;
+
+ vst2_lane_u8(dst, result2, 0);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 1);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 2);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 3);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 4);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 5);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 6);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 7);
+}
+#else
+static INLINE void write_2x4(unsigned char *dst, int pitch,
+ const uint8x8x2_t result) {
+ /*
+ * uint8x8x2_t result
+ 00 01 02 03 | 04 05 06 07
+ 10 11 12 13 | 14 15 16 17
+ ---
+ * after vtrn_u8
+ 00 10 02 12 | 04 14 06 16
+ 01 11 03 13 | 05 15 07 17
+ */
+ const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0],
+ result.val[1]);
+ const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);
+ const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 0);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 0);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 1);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 1);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 2);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 2);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 3);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 3);
+}
+
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+ const uint8x8x2_t result,
+ const uint8x8x2_t result2) {
+ write_2x4(dst, pitch, result);
+ dst += pitch * 8;
+ write_2x4(dst, pitch, result2);
+}
+#endif
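+
+/* Illustrative sketch, not upstream code: both write_2x8() paths store the
+ * filtered (p0, q0) byte pair of row r at dst[r * pitch], for 16 rows in
+ * total. A plain C reference: */
+static void write_2x8_ref(unsigned char *dst, int pitch,
+                          const unsigned char p0[16],
+                          const unsigned char q0[16]) {
+  int r;
+  for (r = 0; r < 16; ++r) {
+    dst[r * pitch + 0] = p0[r];
+    dst[r * pitch + 1] = q0[r];
+  }
+}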
+
+
+#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
+ x = vld4_lane_u8(src, x, 0);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 1);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 2);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 3);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 4);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 5);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 6);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 7);
+ return x;
+}
+#else
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
+ const uint8x8_t a = vld1_u8(src);
+ const uint8x8_t b = vld1_u8(src + pitch * 1);
+ const uint8x8_t c = vld1_u8(src + pitch * 2);
+ const uint8x8_t d = vld1_u8(src + pitch * 3);
+ const uint8x8_t e = vld1_u8(src + pitch * 4);
+ const uint8x8_t f = vld1_u8(src + pitch * 5);
+ const uint8x8_t g = vld1_u8(src + pitch * 6);
+ const uint8x8_t h = vld1_u8(src + pitch * 7);
+ const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
+ vreinterpret_u32_u8(e));
+ const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
+ vreinterpret_u32_u8(f));
+ const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
+ vreinterpret_u32_u8(g));
+ const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
+ vreinterpret_u32_u8(h));
+ const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
+ vreinterpret_u16_u32(r26_u32.val[0]));
+ const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
+ vreinterpret_u16_u32(r37_u32.val[0]));
+ const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+ vreinterpret_u8_u16(r13_u16.val[0]));
+ const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+ vreinterpret_u8_u16(r13_u16.val[1]));
+ /*
+ * after vtrn_u32
+ 00 01 02 03 | 40 41 42 43
+ 10 11 12 13 | 50 51 52 53
+ 20 21 22 23 | 60 61 62 63
+ 30 31 32 33 | 70 71 72 73
+ ---
+ * after vtrn_u16
+ 00 01 20 21 | 40 41 60 61
+ 02 03 22 23 | 42 43 62 63
+ 10 11 30 31 | 50 51 70 71
+ 12 13 32 33 | 52 53 72 73
+
+ 00 01 20 21 | 40 41 60 61
+ 10 11 30 31 | 50 51 70 71
+ 02 03 22 23 | 42 43 62 63
+ 12 13 32 33 | 52 53 72 73
+ ---
+ * after vtrn_u8
+ 00 10 20 30 | 40 50 60 70
+ 01 11 21 31 | 41 51 61 71
+ 02 12 22 32 | 42 52 62 72
+ 03 13 23 33 | 43 53 63 73
+ */
+ x.val[0] = r01_u8.val[0];
+ x.val[1] = r01_u8.val[1];
+ x.val[2] = r23_u8.val[0];
+ x.val[3] = r23_u8.val[1];
+
+ return x;
+}
+#endif
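+
+/* Illustrative sketch, not upstream code: either read_4x8() variant gathers
+ * a 4-wide, 8-tall block so that lane r of x.val[k] equals
+ * src[r * pitch + k], i.e. each val[] vector holds one column. A plain C
+ * reference: */
+static void read_4x8_ref(const unsigned char *src, int pitch,
+                         unsigned char x[4][8]) {
+  int r, k;
+  for (r = 0; r < 8; ++r)
+    for (k = 0; k < 4; ++k)
+      x[k][r] = src[r * pitch + k];
+}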
+
+static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit) {
+ unsigned char *src1;
+ uint8x16_t qblimit, q0u8;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
+ int16x8_t q2s16, q13s16, q11s16;
+ int8x8_t d28s8, d29s8;
+ int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
+ uint8x8x4_t d0u8x4; // d6, d7, d8, d9
+ uint8x8x4_t d1u8x4; // d10, d11, d12, d13
+ uint8x8x2_t d2u8x2; // d12, d13
+ uint8x8x2_t d3u8x2; // d14, d15
+
+ qblimit = vdupq_n_u8(*blimit);
+
+ src1 = s - 2;
+ d0u8x4 = read_4x8(src1, p, d0u8x4);
+ src1 += p * 8;
+ d1u8x4 = read_4x8(src1, p, d1u8x4);
+
+ q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10
+ q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12
+ q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11
+ q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13
+
+ q15u8 = vabdq_u8(q5u8, q4u8);
+ q14u8 = vabdq_u8(q3u8, q6u8);
+
+ q15u8 = vqaddq_u8(q15u8, q15u8);
+ q14u8 = vshrq_n_u8(q14u8, 1);
+ q0u8 = vdupq_n_u8(0x80);
+ q11s16 = vdupq_n_s16(3);
+ q15u8 = vqaddq_u8(q15u8, q14u8);
+
+ q3u8 = veorq_u8(q3u8, q0u8);
+ q4u8 = veorq_u8(q4u8, q0u8);
+ q5u8 = veorq_u8(q5u8, q0u8);
+ q6u8 = veorq_u8(q6u8, q0u8);
+
+ q15u8 = vcgeq_u8(qblimit, q15u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
+ vget_low_s8(vreinterpretq_s8_u8(q5u8)));
+ q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
+ vget_high_s8(vreinterpretq_s8_u8(q5u8)));
+
+ q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),
+ vreinterpretq_s8_u8(q6u8));
+
+ q2s16 = vmulq_s16(q2s16, q11s16);
+ q13s16 = vmulq_s16(q13s16, q11s16);
+
+ q11u8 = vdupq_n_u8(3);
+ q12u8 = vdupq_n_u8(4);
+
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
+ q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));
+
+ d28s8 = vqmovn_s16(q2s16);
+ d29s8 = vqmovn_s16(q13s16);
+ q14s8 = vcombine_s8(d28s8, d29s8);
+
+ q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));
+
+ q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
+ q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q14s8 = vshrq_n_s8(q3s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
+ q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);
+
+ q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+ q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+ d2u8x2.val[0] = vget_low_u8(q6u8); // d12
+ d2u8x2.val[1] = vget_low_u8(q7u8); // d14
+ d3u8x2.val[0] = vget_high_u8(q6u8); // d13
+ d3u8x2.val[1] = vget_high_u8(q7u8); // d15
+
+ src1 = s - 1;
+ write_2x8(src1, p, d2u8x2, d3u8x2);
+}
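+
+/* Illustrative sketch, not upstream code: the intrinsics above follow the
+ * VP8 simple loop filter. In scalar form (helper names are ours; pixels are
+ * biased by 0x80 so saturating signed arithmetic can be used): */
+static signed char sclamp_ref(int t) {
+  return (signed char)(t < -128 ? -128 : (t > 127 ? 127 : t));
+}
+
+static void simple_filter_ref(unsigned char blimit, unsigned char *p1,
+                              unsigned char *p0, unsigned char *q0,
+                              unsigned char *q1) {
+  const int a0 = *p0 > *q0 ? *p0 - *q0 : *q0 - *p0;
+  const int a1 = *p1 > *q1 ? *p1 - *q1 : *q1 - *p1;
+  if (a0 * 2 + a1 / 2 <= blimit) {
+    const signed char ps1 = (signed char)(*p1 ^ 0x80);
+    const signed char ps0 = (signed char)(*p0 ^ 0x80);
+    const signed char qs0 = (signed char)(*q0 ^ 0x80);
+    const signed char qs1 = (signed char)(*q1 ^ 0x80);
+    int f = sclamp_ref(ps1 - qs1);
+    f = sclamp_ref(f + 3 * (qs0 - ps0));
+    *q0 = (unsigned char)(sclamp_ref(qs0 - (sclamp_ref(f + 4) >> 3)) ^ 0x80);
+    *p0 = (unsigned char)(sclamp_ref(ps0 + (sclamp_ref(f + 3) >> 3)) ^ 0x80);
+  }
+}
+/* (The SIMD version has no per-lane branch, so it computes the mask with
+ * vcgeq_u8 and ANDs it into the filter value instead.) */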
+
+void vp8_loop_filter_bvs_neon(
+ unsigned char *y_ptr,
+ int y_stride,
+ const unsigned char *blimit) {
+ y_ptr += 4;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += 4;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += 4;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
+
+void vp8_loop_filter_mbvs_neon(
+ unsigned char *y_ptr,
+ int y_stride,
+ const unsigned char *blimit) {
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
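+
+/* Hypothetical driver, not upstream code, showing which columns each wrapper
+ * touches for one 16x16 luma macroblock: */
+static void filter_mb_columns_example(unsigned char *y_ptr, int y_stride,
+                                      const unsigned char *mblim,
+                                      const unsigned char *blim) {
+  vp8_loop_filter_mbvs_neon(y_ptr, y_stride, mblim);  /* column 0 */
+  vp8_loop_filter_bvs_neon(y_ptr, y_stride, blim);    /* columns 4, 8, 12 */
+}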
diff --git a/source/libvpx/vp8/common/arm/neon/reconintra_neon.c b/source/libvpx/vp8/common/arm/neon/reconintra_neon.c
new file mode 100644
index 0000000..af52cd5
--- /dev/null
+++ b/source/libvpx/vp8/common/arm/neon/reconintra_neon.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vp8/common/blockd.h"
+
+void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x,
+ unsigned char * yabove_row,
+ unsigned char * yleft,
+ int left_stride,
+ unsigned char * ypred_ptr,
+ int y_stride) {
+ const int mode = x->mode_info_context->mbmi.mode;
+ int i;
+
+ switch (mode) {
+ case DC_PRED:
+ {
+ int shift = x->up_available + x->left_available;
+ uint8x16_t v_expected_dc = vdupq_n_u8(128);
+
+ if (shift) {
+ unsigned int average = 0;
+ int expected_dc;
+ if (x->up_available) {
+ const uint8x16_t v_above = vld1q_u8(yabove_row);
+ const uint16x8_t a = vpaddlq_u8(v_above);
+ const uint32x4_t b = vpaddlq_u16(a);
+ const uint64x2_t c = vpaddlq_u32(b);
+ const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+ vreinterpret_u32_u64(vget_high_u64(c)));
+ average = vget_lane_u32(d, 0);
+ }
+ if (x->left_available) {
+ for (i = 0; i < 16; ++i) {
+ average += yleft[0];
+ yleft += left_stride;
+ }
+ }
+ shift += 3;
+ expected_dc = (average + (1 << (shift - 1))) >> shift;
+ v_expected_dc = vmovq_n_u8((uint8_t)expected_dc);
+ }
+ for (i = 0; i < 16; ++i) {
+ vst1q_u8(ypred_ptr, v_expected_dc);
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case V_PRED:
+ {
+ const uint8x16_t v_above = vld1q_u8(yabove_row);
+ for (i = 0; i < 16; ++i) {
+ vst1q_u8(ypred_ptr, v_above);
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case H_PRED:
+ {
+ for (i = 0; i < 16; ++i) {
+ const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]);
+ yleft += left_stride;
+ vst1q_u8(ypred_ptr, v_yleft);
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case TM_PRED:
+ {
+ const uint16x8_t v_ytop_left = vmovq_n_u16((int16_t)yabove_row[-1]);
+ const uint8x16_t v_above = vld1q_u8(yabove_row);
+ for (i = 0; i < 16; ++i) {
+ const uint8x8_t v_yleft = vmov_n_u8((int8_t)yleft[0]);
+ const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft);
+ const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft);
+ const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo),
+ vreinterpretq_s16_u16(v_ytop_left));
+ const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi),
+ vreinterpretq_s16_u16(v_ytop_left));
+ const uint8x8_t pred_lo = vqmovun_s16(b_lo);
+ const uint8x8_t pred_hi = vqmovun_s16(b_hi);
+
+ vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi));
+ ypred_ptr += y_stride;
+ yleft += left_stride;
+ }
+ }
+ break;
+ }
+}
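+
+/* Illustrative sketch, not upstream code: scalar form of the DC_PRED
+ * rounding above (when neither edge is available the code simply uses 128).
+ * The shift ends up as log2 of the number of edge pixels summed, so the
+ * predictor is a rounded average, e.g. (sum + 16) >> 5 when both edges of a
+ * 16x16 block contribute: */
+static int dc_predictor_ref(unsigned int sum, int up_available,
+                            int left_available, int log2_width) {
+  const int shift = up_available + left_available + log2_width - 1;
+  return (int)((sum + (1u << (shift - 1))) >> shift);
+}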
+
+void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ unsigned char * upred_ptr,
+ unsigned char * vpred_ptr,
+ int pred_stride) {
+ const int mode = x->mode_info_context->mbmi.uv_mode;
+ int i;
+
+ switch (mode) {
+ case DC_PRED:
+ {
+ int shift = x->up_available + x->left_available;
+ uint8x8_t v_expected_udc = vdup_n_u8(128);
+ uint8x8_t v_expected_vdc = vdup_n_u8(128);
+
+ if (shift) {
+ unsigned int average_u = 0;
+ unsigned int average_v = 0;
+ int expected_udc;
+ int expected_vdc;
+ if (x->up_available) {
+ const uint8x8_t v_uabove = vld1_u8(uabove_row);
+ const uint8x8_t v_vabove = vld1_u8(vabove_row);
+ const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove));
+ const uint32x4_t b = vpaddlq_u16(a);
+ const uint64x2_t c = vpaddlq_u32(b);
+ average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0);
+ average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2);
+ }
+ if (x->left_available) {
+ for (i = 0; i < 8; ++i) {
+ average_u += uleft[0];
+ uleft += left_stride;
+ average_v += vleft[0];
+ vleft += left_stride;
+ }
+ }
+ shift += 2;
+ expected_udc = (average_u + (1 << (shift - 1))) >> shift;
+ expected_vdc = (average_v + (1 << (shift - 1))) >> shift;
+ v_expected_udc = vmov_n_u8((uint8_t)expected_udc);
+ v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc);
+ }
+ for (i = 0; i < 8; ++i) {
+ vst1_u8(upred_ptr, v_expected_udc);
+ upred_ptr += pred_stride;
+ vst1_u8(vpred_ptr, v_expected_vdc);
+ vpred_ptr += pred_stride;
+ }
+ }
+ break;
+ case V_PRED:
+ {
+ const uint8x8_t v_uabove = vld1_u8(uabove_row);
+ const uint8x8_t v_vabove = vld1_u8(vabove_row);
+ for (i = 0; i < 8; ++i) {
+ vst1_u8(upred_ptr, v_uabove);
+ upred_ptr += pred_stride;
+ vst1_u8(vpred_ptr, v_vabove);
+ vpred_ptr += pred_stride;
+ }
+ }
+ break;
+ case H_PRED:
+ {
+ for (i = 0; i < 8; ++i) {
+ const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]);
+ const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]);
+ uleft += left_stride;
+ vleft += left_stride;
+ vst1_u8(upred_ptr, v_uleft);
+ upred_ptr += pred_stride;
+ vst1_u8(vpred_ptr, v_vleft);
+ vpred_ptr += pred_stride;
+ }
+ }
+ break;
+ case TM_PRED:
+ {
+ const uint16x8_t v_utop_left = vmovq_n_u16((int16_t)uabove_row[-1]);
+ const uint16x8_t v_vtop_left = vmovq_n_u16((int16_t)vabove_row[-1]);
+ const uint8x8_t v_uabove = vld1_u8(uabove_row);
+ const uint8x8_t v_vabove = vld1_u8(vabove_row);
+ for (i = 0; i < 8; ++i) {
+ const uint8x8_t v_uleft = vmov_n_u8((int8_t)uleft[0]);
+ const uint8x8_t v_vleft = vmov_n_u8((int8_t)vleft[0]);
+ const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft);
+ const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft);
+ const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u),
+ vreinterpretq_s16_u16(v_utop_left));
+ const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v),
+ vreinterpretq_s16_u16(v_vtop_left));
+ const uint8x8_t pred_u = vqmovun_s16(b_u);
+ const uint8x8_t pred_v = vqmovun_s16(b_v);
+
+ vst1_u8(upred_ptr, pred_u);
+ vst1_u8(vpred_ptr, pred_v);
+ upred_ptr += pred_stride;
+ vpred_ptr += pred_stride;
+ uleft += left_stride;
+ vleft += left_stride;
+ }
+ }
+ break;
+ }
+}
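+
+/* Illustrative sketch, not upstream code: scalar form of the TM_PRED
+ * arithmetic used by both functions above. Each predicted pixel is
+ * left[r] + above[c] - top_left, with vqmovun_s16 providing the 0..255
+ * saturation: */
+static unsigned char tm_pixel_ref(unsigned char above_c,
+                                  unsigned char left_r,
+                                  unsigned char top_left) {
+  const int v = left_r + above_c - top_left;
+  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
+}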
diff --git a/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm b/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
deleted file mode 100644
index adc5b7e..0000000
--- a/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ /dev/null
@@ -1,425 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-;-----------------
-
- EXPORT |vp8_sub_pixel_variance16x16_neon_func|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.
-
-bilinear_taps_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-|vp8_sub_pixel_variance16x16_neon_func| PROC
- push {r4-r6, lr}
- vpush {d8-d15}
-
- adr r12, bilinear_taps_coeff
- ldr r4, [sp, #80] ;load *dst_ptr from stack
- ldr r5, [sp, #84] ;load dst_pixels_per_line from stack
- ldr r6, [sp, #88] ;load *sse from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_bfilter16x16_only
-
- add r2, r12, r2, lsl #3 ;calculate filter location
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
-
- vld1.s32 {d31}, [r2] ;load first_pass filter
-
- beq firstpass_bfilter16x16_only
-
- sub sp, sp, #272 ;reserve space on stack for temporary storage
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- mov lr, sp
- vld1.u8 {d5, d6, d7}, [r0], r1
-
- mov r2, #3 ;loop counter
- vld1.u8 {d8, d9, d10}, [r0], r1
-
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- vdup.8 d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16_loop_neon
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d5, d0
- vmull.u8 q10, d6, d0
- vmull.u8 q11, d8, d0
- vmull.u8 q12, d9, d0
- vmull.u8 q13, d11, d0
- vmull.u8 q14, d12, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
- vext.8 d11, d11, d12, #1
-
- vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q9, d5, d1
- vmlal.u8 q11, d8, d1
- vmlal.u8 q13, d11, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
- vext.8 d12, d12, d13, #1
-
- vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q10, d6, d1
- vmlal.u8 q12, d9, d1
- vmlal.u8 q14, d12, d1
-
- subs r2, r2, #1
-
- vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d15, q8, #7
- vqrshrn.u16 d16, q9, #7
- vqrshrn.u16 d17, q10, #7
- vqrshrn.u16 d18, q11, #7
- vqrshrn.u16 d19, q12, #7
- vqrshrn.u16 d20, q13, #7
-
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- vqrshrn.u16 d21, q14, #7
- vld1.u8 {d5, d6, d7}, [r0], r1
-
- vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
- vld1.u8 {d8, d9, d10}, [r0], r1
- vst1.u8 {d18, d19, d20, d21}, [lr]!
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- bne vp8e_filt_blk2d_fp16x16_loop_neon
-
-;First-pass filtering for the remaining 5 lines
- vld1.u8 {d14, d15, d16}, [r0], r1
-
- vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q10, d3, d0
- vmull.u8 q11, d5, d0
- vmull.u8 q12, d6, d0
- vmull.u8 q13, d8, d0
- vmull.u8 q14, d9, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
-
- vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q11, d5, d1
- vmlal.u8 q13, d8, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
-
- vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q12, d6, d1
- vmlal.u8 q14, d9, d1
-
- vmull.u8 q1, d11, d0
- vmull.u8 q2, d12, d0
- vmull.u8 q3, d14, d0
- vmull.u8 q4, d15, d0
-
- vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
- vext.8 d14, d14, d15, #1
-
- vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q3, d14, d1
-
- vext.8 d12, d12, d13, #1
- vext.8 d15, d15, d16, #1
-
- vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q4, d15, d1
-
- vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d11, q10, #7
- vqrshrn.u16 d12, q11, #7
- vqrshrn.u16 d13, q12, #7
- vqrshrn.u16 d14, q13, #7
- vqrshrn.u16 d15, q14, #7
- vqrshrn.u16 d16, q1, #7
- vqrshrn.u16 d17, q2, #7
- vqrshrn.u16 d18, q3, #7
- vqrshrn.u16 d19, q4, #7
-
- vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
- vst1.u8 {d14, d15, d16, d17}, [lr]!
- vst1.u8 {d18, d19}, [lr]!
-
-;Second pass: 16x16
-;secondpass_filter
- add r3, r12, r3, lsl #3
- sub lr, lr, #272
-
- vld1.u32 {d31}, [r3] ;load second_pass filter
-
- sub sp, sp, #256
- mov r3, sp
-
- vld1.u8 {d22, d23}, [lr]! ;load src data
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
- mov r12, #4 ;loop counter
-
-vp8e_filt_blk2d_sp16x16_loop_neon
- vld1.u8 {d24, d25}, [lr]!
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
- vld1.u8 {d26, d27}, [lr]!
- vmull.u8 q2, d23, d0
- vld1.u8 {d28, d29}, [lr]!
- vmull.u8 q3, d24, d0
- vld1.u8 {d30, d31}, [lr]!
-
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
- vmlal.u8 q2, d25, d1
- vmlal.u8 q3, d26, d1
- vmlal.u8 q4, d27, d1
- vmlal.u8 q5, d28, d1
- vmlal.u8 q6, d29, d1
- vmlal.u8 q7, d30, d1
- vmlal.u8 q8, d31, d1
-
- subs r12, r12, #1
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
- vqrshrn.u16 d6, q5, #7
- vqrshrn.u16 d7, q6, #7
- vqrshrn.u16 d8, q7, #7
- vqrshrn.u16 d9, q8, #7
-
- vst1.u8 {d2, d3}, [r3]! ;store result
- vst1.u8 {d4, d5}, [r3]!
- vst1.u8 {d6, d7}, [r3]!
- vmov q11, q15
- vst1.u8 {d8, d9}, [r3]!
-
- bne vp8e_filt_blk2d_sp16x16_loop_neon
-
- b sub_pixel_variance16x16_neon
-
-;--------------------
-firstpass_bfilter16x16_only
- mov r2, #4 ;loop counter
- sub sp, sp, #528 ;reserve space on stack for temporary storage
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vdup.8 d1, d31[4]
- mov r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16_loop_neon
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- vld1.u8 {d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10}, [r0], r1
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d5, d0
- vmull.u8 q10, d6, d0
- vmull.u8 q11, d8, d0
- vmull.u8 q12, d9, d0
- vmull.u8 q13, d11, d0
- vmull.u8 q14, d12, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
- vext.8 d11, d11, d12, #1
-
- vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q9, d5, d1
- vmlal.u8 q11, d8, d1
- vmlal.u8 q13, d11, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
- vext.8 d12, d12, d13, #1
-
- vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q10, d6, d1
- vmlal.u8 q12, d9, d1
- vmlal.u8 q14, d12, d1
-
- subs r2, r2, #1
-
- vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d15, q8, #7
- vqrshrn.u16 d16, q9, #7
- vqrshrn.u16 d17, q10, #7
- vqrshrn.u16 d18, q11, #7
- vqrshrn.u16 d19, q12, #7
- vqrshrn.u16 d20, q13, #7
- vst1.u8 {d14, d15}, [r3]! ;store result
- vqrshrn.u16 d21, q14, #7
-
- vst1.u8 {d16, d17}, [r3]!
- vst1.u8 {d18, d19}, [r3]!
- vst1.u8 {d20, d21}, [r3]!
-
- bne vp8e_filt_blk2d_fpo16x16_loop_neon
-
- b sub_pixel_variance16x16_neon
-
-;---------------------
-secondpass_bfilter16x16_only
-;Second pass: 16x16
-;secondpass_filter
- sub sp, sp, #528 ;reserve space on stack for temporary storage
- add r3, r12, r3, lsl #3
- mov r12, #4 ;loop counter
- vld1.u32 {d31}, [r3] ;load second_pass filter
- vld1.u8 {d22, d23}, [r0], r1 ;load src data
- mov r3, sp
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
-
-vp8e_filt_blk2d_spo16x16_loop_neon
- vld1.u8 {d24, d25}, [r0], r1
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
- vld1.u8 {d26, d27}, [r0], r1
- vmull.u8 q2, d23, d0
- vld1.u8 {d28, d29}, [r0], r1
- vmull.u8 q3, d24, d0
- vld1.u8 {d30, d31}, [r0], r1
-
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
- vmlal.u8 q2, d25, d1
- vmlal.u8 q3, d26, d1
- vmlal.u8 q4, d27, d1
- vmlal.u8 q5, d28, d1
- vmlal.u8 q6, d29, d1
- vmlal.u8 q7, d30, d1
- vmlal.u8 q8, d31, d1
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
- vqrshrn.u16 d6, q5, #7
- vqrshrn.u16 d7, q6, #7
- vqrshrn.u16 d8, q7, #7
- vqrshrn.u16 d9, q8, #7
-
- vst1.u8 {d2, d3}, [r3]! ;store result
- subs r12, r12, #1
- vst1.u8 {d4, d5}, [r3]!
- vmov q11, q15
- vst1.u8 {d6, d7}, [r3]!
- vst1.u8 {d8, d9}, [r3]!
-
- bne vp8e_filt_blk2d_spo16x16_loop_neon
-
- b sub_pixel_variance16x16_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16_neon
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- sub r3, r3, #256
- mov r12, #8
-
-sub_pixel_variance16x16_neon_loop
- vld1.8 {q0}, [r3]! ;Load up source and reference
- vld1.8 {q2}, [r4], r5
- vld1.8 {q1}, [r3]!
- vld1.8 {q3}, [r4], r5
-
- vsubl.u8 q11, d0, d4 ;diff
- vsubl.u8 q12, d1, d5
- vsubl.u8 q13, d2, d6
- vsubl.u8 q14, d3, d7
-
- vpadal.s16 q8, q11 ;sum
- vmlal.s16 q9, d22, d22 ;sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- bne sub_pixel_variance16x16_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [r6] ;store sse
- vshr.u32 d10, d10, #8
- vsub.u32 d0, d1, d10
-
- add sp, sp, #528
- vmov.32 r0, d0[0] ;return
-
- vpop {d8-d15}
- pop {r4-r6,pc}
-
- ENDP
-
- END
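For reference: the epilogue shared by these removed variance routines (the
vmull.s32/vshr.u32/vsub.u32 sequence) stores sse and returns
sse - sum*sum/N, where N is the pixel count (256 for 16x16, hence the #8
shift; 64 for 8x8, hence the #6 shift). An illustrative scalar sketch,
names ours:

    unsigned int variance_tail(unsigned int sse, int sum, int log2_n) {
      return sse - (unsigned int)(((long long)sum * sum) >> log2_n);
    }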
diff --git a/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
deleted file mode 100644
index b0829af..0000000
--- a/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ /dev/null
@@ -1,583 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_variance_halfpixvar16x16_h_neon|
- EXPORT |vp8_variance_halfpixvar16x16_v_neon|
- EXPORT |vp8_variance_halfpixvar16x16_hv_neon|
- EXPORT |vp8_sub_pixel_variance16x16s_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;================================================
-;unsigned int vp8_variance_halfpixvar16x16_h_neon
-;(
-; unsigned char *src_ptr, r0
-; int src_pixels_per_line, r1
-; unsigned char *dst_ptr, r2
-; int dst_pixels_per_line, r3
-; unsigned int *sse
-;);
-;================================================
-|vp8_variance_halfpixvar16x16_h_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- mov r12, #4 ;loop counter
- ldr lr, [sp, #68] ;load *sse from stack
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8_filt_fpo16x16s_4_0_loop_neon
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
- vld1.8 {q11}, [r2], r3
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.8 {q12}, [r2], r3
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.8 {q13}, [r2], r3
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
- vext.8 q3, q2, q3, #1
- vext.8 q5, q4, q5, #1
- vext.8 q7, q6, q7, #1
-
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vld1.8 {q14}, [r2], r3
- vrhadd.u8 q1, q2, q3
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
-
- vsubl.u8 q4, d0, d22 ;diff
- vsubl.u8 q5, d1, d23
- vsubl.u8 q6, d2, d24
- vsubl.u8 q7, d3, d25
- vsubl.u8 q0, d4, d26
- vsubl.u8 q1, d5, d27
- vsubl.u8 q2, d6, d28
- vsubl.u8 q3, d7, d29
-
- vpadal.s16 q8, q4 ;sum
- vmlal.s16 q9, d8, d8 ;sse
- vmlal.s16 q10, d9, d9
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q5
- vmlal.s16 q9, d10, d10
- vmlal.s16 q10, d11, d11
- vpadal.s16 q8, q6
- vmlal.s16 q9, d12, d12
- vmlal.s16 q10, d13, d13
- vpadal.s16 q8, q7
- vmlal.s16 q9, d14, d14
- vmlal.s16 q10, d15, d15
-
- vpadal.s16 q8, q0 ;sum
- vmlal.s16 q9, d0, d0 ;sse
- vmlal.s16 q10, d1, d1
- vpadal.s16 q8, q1
- vmlal.s16 q9, d2, d2
- vmlal.s16 q10, d3, d3
- vpadal.s16 q8, q2
- vmlal.s16 q9, d4, d4
- vmlal.s16 q10, d5, d5
- vpadal.s16 q8, q3
- vmlal.s16 q9, d6, d6
- vmlal.s16 q10, d7, d7
-
- bne vp8_filt_fpo16x16s_4_0_loop_neon
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.u32 d10, d10, #8
- vsub.u32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
-
- vpop {d8-d15}
- pop {pc}
- ENDP
-
-;================================================
-;unsigned int vp8_variance_halfpixvar16x16_v_neon
-;(
-; unsigned char *src_ptr, r0
-; int src_pixels_per_line, r1
-; unsigned char *dst_ptr, r2
-; int dst_pixels_per_line, r3
-; unsigned int *sse
-;);
-;================================================
-|vp8_variance_halfpixvar16x16_v_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- mov r12, #4 ;loop counter
-
- vld1.u8 {q0}, [r0], r1 ;load src data
- ldr lr, [sp, #68] ;load *sse from stack
-
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
-vp8_filt_spo16x16s_0_4_loop_neon
- vld1.u8 {q2}, [r0], r1
- vld1.8 {q1}, [r2], r3
- vld1.u8 {q4}, [r0], r1
- vld1.8 {q3}, [r2], r3
- vld1.u8 {q6}, [r0], r1
- vld1.8 {q5}, [r2], r3
- vld1.u8 {q15}, [r0], r1
-
- vrhadd.u8 q0, q0, q2
- vld1.8 {q7}, [r2], r3
- vrhadd.u8 q2, q2, q4
- vrhadd.u8 q4, q4, q6
- vrhadd.u8 q6, q6, q15
-
- vsubl.u8 q11, d0, d2 ;diff
- vsubl.u8 q12, d1, d3
- vsubl.u8 q13, d4, d6
- vsubl.u8 q14, d5, d7
- vsubl.u8 q0, d8, d10
- vsubl.u8 q1, d9, d11
- vsubl.u8 q2, d12, d14
- vsubl.u8 q3, d13, d15
-
- vpadal.s16 q8, q11 ;sum
- vmlal.s16 q9, d22, d22 ;sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- vpadal.s16 q8, q0 ;sum
- vmlal.s16 q9, d0, d0 ;sse
- vmlal.s16 q10, d1, d1
- vpadal.s16 q8, q1
- vmlal.s16 q9, d2, d2
- vmlal.s16 q10, d3, d3
- vpadal.s16 q8, q2
- vmlal.s16 q9, d4, d4
- vmlal.s16 q10, d5, d5
-
- vmov q0, q15
-
- vpadal.s16 q8, q3
- vmlal.s16 q9, d6, d6
- vmlal.s16 q10, d7, d7
-
- bne vp8_filt_spo16x16s_0_4_loop_neon
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.u32 d10, d10, #8
- vsub.u32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
-
- vpop {d8-d15}
- pop {pc}
- ENDP
-
-;================================================
-;unsigned int vp8_variance_halfpixvar16x16_hv_neon
-;(
-; unsigned char *src_ptr, r0
-; int src_pixels_per_line, r1
-; unsigned char *dst_ptr, r2
-; int dst_pixels_per_line, r3
-; unsigned int *sse
-;);
-;================================================
-|vp8_variance_halfpixvar16x16_hv_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
-
- ldr lr, [sp, #68] ;load *sse from stack
- vmov.i8 q13, #0 ;q8 - sum
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
-
- vmov.i8 q14, #0 ;q9, q10 - sse
- vmov.i8 q15, #0
-
- mov r12, #4 ;loop counter
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8_filt16x16s_4_4_loop_neon
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
- vld1.u8 {d16, d17, d18, d19}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
- vext.8 q5, q4, q5, #1
- vext.8 q7, q6, q7, #1
- vext.8 q9, q8, q9, #1
-
- vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
- vrhadd.u8 q4, q8, q9
-
- vld1.8 {q5}, [r2], r3
- vrhadd.u8 q0, q0, q1
- vld1.8 {q6}, [r2], r3
- vrhadd.u8 q1, q1, q2
- vld1.8 {q7}, [r2], r3
- vrhadd.u8 q2, q2, q3
- vld1.8 {q8}, [r2], r3
- vrhadd.u8 q3, q3, q4
-
- vsubl.u8 q9, d0, d10 ;diff
- vsubl.u8 q10, d1, d11
- vsubl.u8 q11, d2, d12
- vsubl.u8 q12, d3, d13
-
- vsubl.u8 q0, d4, d14 ;diff
- vsubl.u8 q1, d5, d15
- vsubl.u8 q5, d6, d16
- vsubl.u8 q6, d7, d17
-
- vpadal.s16 q13, q9 ;sum
- vmlal.s16 q14, d18, d18 ;sse
- vmlal.s16 q15, d19, d19
-
- vpadal.s16 q13, q10 ;sum
- vmlal.s16 q14, d20, d20 ;sse
- vmlal.s16 q15, d21, d21
-
- vpadal.s16 q13, q11 ;sum
- vmlal.s16 q14, d22, d22 ;sse
- vmlal.s16 q15, d23, d23
-
- vpadal.s16 q13, q12 ;sum
- vmlal.s16 q14, d24, d24 ;sse
- vmlal.s16 q15, d25, d25
-
- subs r12, r12, #1
-
- vpadal.s16 q13, q0 ;sum
- vmlal.s16 q14, d0, d0 ;sse
- vmlal.s16 q15, d1, d1
-
- vpadal.s16 q13, q1 ;sum
- vmlal.s16 q14, d2, d2 ;sse
- vmlal.s16 q15, d3, d3
-
- vpadal.s16 q13, q5 ;sum
- vmlal.s16 q14, d10, d10 ;sse
- vmlal.s16 q15, d11, d11
-
- vmov q0, q4
-
- vpadal.s16 q13, q6 ;sum
- vmlal.s16 q14, d12, d12 ;sse
- vmlal.s16 q15, d13, d13
-
- bne vp8_filt16x16s_4_4_loop_neon
-
- vadd.u32 q15, q14, q15 ;accumulate sse
- vpaddl.s32 q0, q13 ;accumulate sum
-
- vpaddl.u32 q1, q15
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.u32 d10, d10, #8
- vsub.u32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
-
- vpop {d8-d15}
- pop {pc}
- ENDP
-
-;==============================
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack unsigned char *dst_ptr,
-; stack int dst_pixels_per_line,
-; stack unsigned int *sse
-;note: used by vp8_find_best_half_pixel_step() (called when 8 < Speed < 15) and by the first call of
-;vp8_find_best_sub_pixel_step() (called when Speed <= 8). xoffset/yoffset can only be 4 or 0, which means
-;the filter is either bypassed or its coefficients are {64, 64}. This simplified routine only works in
-;that situation.
-;note: xoffset and yoffset may also both be zero; that case can be handled in C code later.
-
-|vp8_sub_pixel_variance16x16s_neon| PROC
- push {r4, lr}
- vpush {d8-d15}
-
- ldr r4, [sp, #72] ;load *dst_ptr from stack
- ldr r12, [sp, #76] ;load dst_pixels_per_line from stack
- ldr lr, [sp, #80] ;load *sse from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_bfilter16x16s_only
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- beq firstpass_bfilter16x16s_only
-
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
- sub sp, sp, #256 ;reserve space on stack for temporary storage
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
- mov r3, sp
- mov r2, #4 ;loop counter
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16s_loop_neon
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
- vld1.u8 {d16, d17, d18, d19}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
- vext.8 q5, q4, q5, #1
- vext.8 q7, q6, q7, #1
- vext.8 q9, q8, q9, #1
-
- vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
- vrhadd.u8 q4, q8, q9
-
- vrhadd.u8 q0, q0, q1
- vrhadd.u8 q1, q1, q2
- vrhadd.u8 q2, q2, q3
- vrhadd.u8 q3, q3, q4
-
- subs r2, r2, #1
- vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result
- vmov q0, q4
- vst1.u8 {d4, d5, d6, d7}, [r3]!
-
- bne vp8e_filt_blk2d_fp16x16s_loop_neon
-
- b sub_pixel_variance16x16s_neon
-
-;--------------------
-firstpass_bfilter16x16s_only
- mov r2, #2 ;loop counter
- sub sp, sp, #256 ;reserve space on stack for temporary storage
- mov r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16s_loop_neon
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
- vld1.u8 {d16, d17, d18, d19}, [r0], r1
- vext.8 q3, q2, q3, #1
- vld1.u8 {d20, d21, d22, d23}, [r0], r1
- vext.8 q5, q4, q5, #1
- vld1.u8 {d24, d25, d26, d27}, [r0], r1
- vext.8 q7, q6, q7, #1
- vld1.u8 {d28, d29, d30, d31}, [r0], r1
- vext.8 q9, q8, q9, #1
- vext.8 q11, q10, q11, #1
- vext.8 q13, q12, q13, #1
- vext.8 q15, q14, q15, #1
-
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vrhadd.u8 q1, q2, q3
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
- vrhadd.u8 q4, q8, q9
- vrhadd.u8 q5, q10, q11
- vrhadd.u8 q6, q12, q13
- vrhadd.u8 q7, q14, q15
-
- subs r2, r2, #1
-
- vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
- vst1.u8 {d4, d5, d6, d7}, [r3]!
- vst1.u8 {d8, d9, d10, d11}, [r3]!
- vst1.u8 {d12, d13, d14, d15}, [r3]!
-
- bne vp8e_filt_blk2d_fpo16x16s_loop_neon
-
- b sub_pixel_variance16x16s_neon
-
-;---------------------
-secondpass_bfilter16x16s_only
- sub sp, sp, #256 ;reserve space on stack for temporary storage
-
- mov r2, #2 ;loop counter
- vld1.u8 {d0, d1}, [r0], r1 ;load src data
- mov r3, sp
-
-vp8e_filt_blk2d_spo16x16s_loop_neon
- vld1.u8 {d2, d3}, [r0], r1
- vld1.u8 {d4, d5}, [r0], r1
- vld1.u8 {d6, d7}, [r0], r1
- vld1.u8 {d8, d9}, [r0], r1
-
- vrhadd.u8 q0, q0, q1
- vld1.u8 {d10, d11}, [r0], r1
- vrhadd.u8 q1, q1, q2
- vld1.u8 {d12, d13}, [r0], r1
- vrhadd.u8 q2, q2, q3
- vld1.u8 {d14, d15}, [r0], r1
- vrhadd.u8 q3, q3, q4
- vld1.u8 {d16, d17}, [r0], r1
- vrhadd.u8 q4, q4, q5
- vrhadd.u8 q5, q5, q6
- vrhadd.u8 q6, q6, q7
- vrhadd.u8 q7, q7, q8
-
- subs r2, r2, #1
-
- vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
- vmov q0, q8
- vst1.u8 {d4, d5, d6, d7}, [r3]!
- vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result
- vst1.u8 {d12, d13, d14, d15}, [r3]!
-
- bne vp8e_filt_blk2d_spo16x16s_loop_neon
-
- b sub_pixel_variance16x16s_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16s_neon
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- sub r3, r3, #256
- mov r2, #4
-
-sub_pixel_variance16x16s_neon_loop
- vld1.8 {q0}, [r3]! ;Load up source and reference
- vld1.8 {q1}, [r4], r12
- vld1.8 {q2}, [r3]!
- vld1.8 {q3}, [r4], r12
- vld1.8 {q4}, [r3]!
- vld1.8 {q5}, [r4], r12
- vld1.8 {q6}, [r3]!
- vld1.8 {q7}, [r4], r12
-
- vsubl.u8 q11, d0, d2 ;diff
- vsubl.u8 q12, d1, d3
- vsubl.u8 q13, d4, d6
- vsubl.u8 q14, d5, d7
- vsubl.u8 q0, d8, d10
- vsubl.u8 q1, d9, d11
- vsubl.u8 q2, d12, d14
- vsubl.u8 q3, d13, d15
-
- vpadal.s16 q8, q11 ;sum
- vmlal.s16 q9, d22, d22 ;sse
- vmlal.s16 q10, d23, d23
-
- subs r2, r2, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- vpadal.s16 q8, q0 ;sum
- vmlal.s16 q9, d0, d0 ;sse
- vmlal.s16 q10, d1, d1
- vpadal.s16 q8, q1
- vmlal.s16 q9, d2, d2
- vmlal.s16 q10, d3, d3
- vpadal.s16 q8, q2
- vmlal.s16 q9, d4, d4
- vmlal.s16 q10, d5, d5
- vpadal.s16 q8, q3
- vmlal.s16 q9, d6, d6
- vmlal.s16 q10, d7, d7
-
- bne sub_pixel_variance16x16s_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.u32 d10, d10, #8
- vsub.u32 d0, d1, d10
-
- add sp, sp, #256
- vmov.32 r0, d0[0] ;return
-
- vpop {d8-d15}
- pop {r4, pc}
- ENDP
-
- END
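For reference: these removed half-pixel variants exploit the fact that
offset 4 selects bilinear taps {64, 64}, collapsing the filter to a rounding
average, which is exactly what vrhadd.u8 computes. An illustrative scalar
sketch, name ours:

    unsigned char half_pel(unsigned char a, unsigned char b) {
      return (unsigned char)((a + b + 1) >> 1);  /* == (a*64 + b*64 + 64) >> 7 */
    }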
diff --git a/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm b/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
deleted file mode 100644
index 9d9f9e0..0000000
--- a/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ /dev/null
@@ -1,225 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sub_pixel_variance8x8_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
-
-|vp8_sub_pixel_variance8x8_neon| PROC
- push {r4-r5, lr}
- vpush {d8-d15}
-
- adr r12, bilinear_taps_coeff
- ldr r4, [sp, #76] ;load *dst_ptr from stack
- ldr r5, [sp, #80] ;load dst_pixels_per_line from stack
- ldr lr, [sp, #84] ;load *sse from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (9x8)
- add r2, r12, r2, lsl #3 ;calculate filter location
-
- vld1.u8 {q1}, [r0], r1 ;load src data
- vld1.u32 {d31}, [r2] ;load first_pass filter
- vld1.u8 {q2}, [r0], r1
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vld1.u8 {q3}, [r0], r1
- vdup.8 d1, d31[4]
- vld1.u8 {q4}, [r0], r1
-
- vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q7, d4, d0
- vmull.u8 q8, d6, d0
- vmull.u8 q9, d8, d0
-
- vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- vext.8 d9, d8, d9, #1
-
- vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
- vmlal.u8 q7, d5, d1
- vmlal.u8 q8, d7, d1
- vmlal.u8 q9, d9, d1
-
- vld1.u8 {q1}, [r0], r1 ;load src data
- vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
- vld1.u8 {q2}, [r0], r1
- vqrshrn.u16 d23, q7, #7
- vld1.u8 {q3}, [r0], r1
- vqrshrn.u16 d24, q8, #7
- vld1.u8 {q4}, [r0], r1
- vqrshrn.u16 d25, q9, #7
-
- ;first_pass filtering on the remaining 5 lines of data
- vld1.u8 {q5}, [r0], r1
-
- vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q7, d4, d0
- vmull.u8 q8, d6, d0
- vmull.u8 q9, d8, d0
- vmull.u8 q10, d10, d0
-
- vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- vext.8 d9, d8, d9, #1
- vext.8 d11, d10, d11, #1
-
- vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
- vmlal.u8 q7, d5, d1
- vmlal.u8 q8, d7, d1
- vmlal.u8 q9, d9, d1
- vmlal.u8 q10, d11, d1
-
- vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d27, q7, #7
- vqrshrn.u16 d28, q8, #7
- vqrshrn.u16 d29, q9, #7
- vqrshrn.u16 d30, q10, #7
-
-;Second pass: 8x8
-secondpass_filter
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- ;skip_secondpass_filter
- beq sub_pixel_variance8x8_neon
-
- add r3, r12, r3, lsl #3
-
- vld1.u32 {d31}, [r3] ;load second_pass filter
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
-
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q2, d23, d0
- vmull.u8 q3, d24, d0
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
- vmlal.u8 q2, d24, d1
- vmlal.u8 q3, d25, d1
- vmlal.u8 q4, d26, d1
- vmlal.u8 q5, d27, d1
- vmlal.u8 q6, d28, d1
- vmlal.u8 q7, d29, d1
- vmlal.u8 q8, d30, d1
-
- vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d23, q2, #7
- vqrshrn.u16 d24, q3, #7
- vqrshrn.u16 d25, q4, #7
- vqrshrn.u16 d26, q5, #7
- vqrshrn.u16 d27, q6, #7
- vqrshrn.u16 d28, q7, #7
- vqrshrn.u16 d29, q8, #7
-
- b sub_pixel_variance8x8_neon
-
-;--------------------
-skip_firstpass_filter
- vld1.u8 {d22}, [r0], r1 ;load src data
- vld1.u8 {d23}, [r0], r1
- vld1.u8 {d24}, [r0], r1
- vld1.u8 {d25}, [r0], r1
- vld1.u8 {d26}, [r0], r1
- vld1.u8 {d27}, [r0], r1
- vld1.u8 {d28}, [r0], r1
- vld1.u8 {d29}, [r0], r1
- vld1.u8 {d30}, [r0], r1
-
- b secondpass_filter
-
-;----------------------
-;vp8_variance8x8_neon
-sub_pixel_variance8x8_neon
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- mov r12, #2
-
-sub_pixel_variance8x8_neon_loop
- vld1.8 {d0}, [r4], r5 ;load dst data
- subs r12, r12, #1
- vld1.8 {d1}, [r4], r5
- vld1.8 {d2}, [r4], r5
- vsubl.u8 q4, d22, d0 ;calculate diff
- vld1.8 {d3}, [r4], r5
-
- vsubl.u8 q5, d23, d1
- vsubl.u8 q6, d24, d2
-
- vpadal.s16 q8, q4 ;sum
- vmlal.s16 q9, d8, d8 ;sse
- vmlal.s16 q10, d9, d9
-
- vsubl.u8 q7, d25, d3
-
- vpadal.s16 q8, q5
- vmlal.s16 q9, d10, d10
- vmlal.s16 q10, d11, d11
-
- vmov q11, q13
-
- vpadal.s16 q8, q6
- vmlal.s16 q9, d12, d12
- vmlal.s16 q10, d13, d13
-
- vmov q12, q14
-
- vpadal.s16 q8, q7
- vmlal.s16 q9, d14, d14
- vmlal.s16 q10, d15, d15
-
- bne sub_pixel_variance8x8_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.u32 d10, d10, #6
- vsub.u32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
-
- vpop {d8-d15}
- pop {r4-r5, pc}
-
- ENDP
-
-;-----------------
-
-bilinear_taps_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
- END
diff --git a/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c b/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
new file mode 100644
index 0000000..6405bf2
--- /dev/null
+++ b/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
@@ -0,0 +1,1028 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef _MSC_VER
+#define __builtin_prefetch(x)
+#endif
+
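+// Bilinear taps for the 8 subpel positions; each pair sums to 128 (1 << 7),
+// matching the 7-bit rounding shifts used throughout this file.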
+static const uint16_t bilinear_taps_coeff[8][2] = {
+ {128, 0},
+ {112, 16},
+ { 96, 32},
+ { 80, 48},
+ { 64, 64},
+ { 48, 80},
+ { 32, 96},
+ { 16, 112}
+};
+
+unsigned int vp8_sub_pixel_variance16x16_neon_func(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ int i;
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528);
+ unsigned char *tmpp;
+ unsigned char *tmpp2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
+ uint8x8_t d19u8, d20u8, d21u8;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64, d2s64, d3s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
+ uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
+ uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
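+ // tmp layout: first-pass output (17 rows x 16) at offset 0; the filtered
+ // 16x16 block consumed by the variance step at offset 272.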
+ tmpp2 = tmp + 272;
+ tmpp = tmp;
+ if (xoffset == 0) { // secondpass_bfilter16x16_only
+ d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
+
+ q11u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ for (i = 4; i > 0; i--) {
+ q12u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q13u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q14u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q15u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+
+ __builtin_prefetch(src_ptr);
+ __builtin_prefetch(src_ptr + src_pixels_per_line);
+ __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
+
+ q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+ q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+ q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+ q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+ q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+ q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+ q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+ q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+ q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+ q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+ q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+ q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+ q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+ q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+ q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+
+ q11u8 = q15u8;
+
+ vst1q_u8((uint8_t *)tmpp2, q1u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q2u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q3u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q4u8);
+ tmpp2 += 16;
+ }
+ } else if (yoffset == 0) { // firstpass_bfilter16x16_only
+ d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
+
+ for (i = 4; i > 0; i--) {
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ __builtin_prefetch(src_ptr);
+ __builtin_prefetch(src_ptr + src_pixels_per_line);
+ __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
+
+ q7u16 = vmull_u8(d2u8, d0u8);
+ q8u16 = vmull_u8(d3u8, d0u8);
+ q9u16 = vmull_u8(d5u8, d0u8);
+ q10u16 = vmull_u8(d6u8, d0u8);
+ q11u16 = vmull_u8(d8u8, d0u8);
+ q12u16 = vmull_u8(d9u8, d0u8);
+ q13u16 = vmull_u8(d11u8, d0u8);
+ q14u16 = vmull_u8(d12u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+
+ q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+
+ q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+ d14u8 = vqrshrn_n_u16(q7u16, 7);
+ d15u8 = vqrshrn_n_u16(q8u16, 7);
+ d16u8 = vqrshrn_n_u16(q9u16, 7);
+ d17u8 = vqrshrn_n_u16(q10u16, 7);
+ d18u8 = vqrshrn_n_u16(q11u16, 7);
+ d19u8 = vqrshrn_n_u16(q12u16, 7);
+ d20u8 = vqrshrn_n_u16(q13u16, 7);
+ d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+ q10u8 = vcombine_u8(d20u8, d21u8);
+
+ vst1q_u8((uint8_t *)tmpp2, q7u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q8u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q9u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q10u8);
+ tmpp2 += 16;
+ }
+ } else {
+ d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
+
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ // First Pass: output_height lines x output_width columns (17x16)
+ for (i = 3; i > 0; i--) {
+ q7u16 = vmull_u8(d2u8, d0u8);
+ q8u16 = vmull_u8(d3u8, d0u8);
+ q9u16 = vmull_u8(d5u8, d0u8);
+ q10u16 = vmull_u8(d6u8, d0u8);
+ q11u16 = vmull_u8(d8u8, d0u8);
+ q12u16 = vmull_u8(d9u8, d0u8);
+ q13u16 = vmull_u8(d11u8, d0u8);
+ q14u16 = vmull_u8(d12u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+
+ q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+
+ q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+ d14u8 = vqrshrn_n_u16(q7u16, 7);
+ d15u8 = vqrshrn_n_u16(q8u16, 7);
+ d16u8 = vqrshrn_n_u16(q9u16, 7);
+ d17u8 = vqrshrn_n_u16(q10u16, 7);
+ d18u8 = vqrshrn_n_u16(q11u16, 7);
+ d19u8 = vqrshrn_n_u16(q12u16, 7);
+ d20u8 = vqrshrn_n_u16(q13u16, 7);
+ d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+ q10u8 = vcombine_u8(d20u8, d21u8);
+
+ vst1q_u8((uint8_t *)tmpp, q7u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q8u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q9u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q10u8);
+ tmpp += 16;
+ }
+
+ // First-pass filtering for the remaining 5 lines
+ d14u8 = vld1_u8(src_ptr);
+ d15u8 = vld1_u8(src_ptr + 8);
+ d16u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ q9u16 = vmull_u8(d2u8, d0u8);
+ q10u16 = vmull_u8(d3u8, d0u8);
+ q11u16 = vmull_u8(d5u8, d0u8);
+ q12u16 = vmull_u8(d6u8, d0u8);
+ q13u16 = vmull_u8(d8u8, d0u8);
+ q14u16 = vmull_u8(d9u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+
+ q9u16 = vmlal_u8(q9u16, d2u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+
+ q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
+
+ q1u16 = vmull_u8(d11u8, d0u8);
+ q2u16 = vmull_u8(d12u8, d0u8);
+ q3u16 = vmull_u8(d14u8, d0u8);
+ q4u16 = vmull_u8(d15u8, d0u8);
+
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+ d14u8 = vext_u8(d14u8, d15u8, 1);
+
+ q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
+ q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
+
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+ d15u8 = vext_u8(d15u8, d16u8, 1);
+
+ q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
+ q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
+
+ d10u8 = vqrshrn_n_u16(q9u16, 7);
+ d11u8 = vqrshrn_n_u16(q10u16, 7);
+ d12u8 = vqrshrn_n_u16(q11u16, 7);
+ d13u8 = vqrshrn_n_u16(q12u16, 7);
+ d14u8 = vqrshrn_n_u16(q13u16, 7);
+ d15u8 = vqrshrn_n_u16(q14u16, 7);
+ d16u8 = vqrshrn_n_u16(q1u16, 7);
+ d17u8 = vqrshrn_n_u16(q2u16, 7);
+ d18u8 = vqrshrn_n_u16(q3u16, 7);
+ d19u8 = vqrshrn_n_u16(q4u16, 7);
+
+ q5u8 = vcombine_u8(d10u8, d11u8);
+ q6u8 = vcombine_u8(d12u8, d13u8);
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+
+ vst1q_u8((uint8_t *)tmpp, q5u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q6u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q7u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q8u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q9u8);
+
+ // secondpass_filter
+ d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
+
+ tmpp = tmp;
+ tmpp2 = tmpp + 272;
+ q11u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ for (i = 4; i > 0; i--) {
+ q12u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q13u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q14u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q15u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+
+ q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+ q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+ q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+ q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+ q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+ q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+ q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+ q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+ q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+ q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+ q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+ q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+ q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+ q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+ q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+
+ q11u8 = q15u8;
+
+ vst1q_u8((uint8_t *)tmpp2, q1u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q2u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q3u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q4u8);
+ tmpp2 += 16;
+ }
+ }
+
+ // sub_pixel_variance16x16_neon
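+ // q8s32 accumulates the signed sum of differences;
+ // q9s32/q10s32 accumulate the squared differences.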
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ tmpp = tmp + 272;
+ for (i = 0; i < 8; i++) { // sub_pixel_variance16x16_neon_loop
+ q0u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q1u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q2u8 = vld1q_u8(dst_ptr);
+ dst_ptr += dst_pixels_per_line;
+ q3u8 = vld1q_u8(dst_ptr);
+ dst_ptr += dst_pixels_per_line;
+
+ d0u8 = vget_low_u8(q0u8);
+ d1u8 = vget_high_u8(q0u8);
+ d2u8 = vget_low_u8(q1u8);
+ d3u8 = vget_high_u8(q1u8);
+
+ q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8));
+ q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8));
+ q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+ q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+ q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vget_low_s64(q0s64);
+ d1s64 = vget_high_s64(q0s64);
+ d2s64 = vget_low_s64(q1s64);
+ d3s64 = vget_high_s64(q1s64);
+ d0s64 = vadd_s64(d0s64, d1s64);
+ d1s64 = vadd_s64(d2s64, d3s64);
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
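+ // variance = sse - sum * sum / 256; the >> 8 divides the squared sum by
+ // the 16x16 pixel count.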
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance_halfpixvar16x16_h_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64, d2s64, d3s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8;
+ uint8x16_t q7u8, q11u8, q12u8, q13u8, q14u8;
+ uint16x8_t q0u16, q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon
+ q0u8 = vld1q_u8(src_ptr);
+ q1u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q2u8 = vld1q_u8(src_ptr);
+ q3u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q4u8 = vld1q_u8(src_ptr);
+ q5u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q6u8 = vld1q_u8(src_ptr);
+ q7u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+
+ q11u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q12u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q13u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q14u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
+ q1u8 = vextq_u8(q0u8, q1u8, 1);
+ q3u8 = vextq_u8(q2u8, q3u8, 1);
+ q5u8 = vextq_u8(q4u8, q5u8, 1);
+ q7u8 = vextq_u8(q6u8, q7u8, 1);
+
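+ // vrhaddq_u8 computes (a + b + 1) >> 1: averaging each pixel with its
+ // right neighbour is the horizontal half-pel filter.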
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+ q1u8 = vrhaddq_u8(q2u8, q3u8);
+ q2u8 = vrhaddq_u8(q4u8, q5u8);
+ q3u8 = vrhaddq_u8(q6u8, q7u8);
+
+ d0u8 = vget_low_u8(q0u8);
+ d1u8 = vget_high_u8(q0u8);
+ d2u8 = vget_low_u8(q1u8);
+ d3u8 = vget_high_u8(q1u8);
+ d4u8 = vget_low_u8(q2u8);
+ d5u8 = vget_high_u8(q2u8);
+ d6u8 = vget_low_u8(q3u8);
+ d7u8 = vget_high_u8(q3u8);
+
+ q4u16 = vsubl_u8(d0u8, vget_low_u8(q11u8));
+ q5u16 = vsubl_u8(d1u8, vget_high_u8(q11u8));
+ q6u16 = vsubl_u8(d2u8, vget_low_u8(q12u8));
+ q7u16 = vsubl_u8(d3u8, vget_high_u8(q12u8));
+ q0u16 = vsubl_u8(d4u8, vget_low_u8(q13u8));
+ q1u16 = vsubl_u8(d5u8, vget_high_u8(q13u8));
+ q2u16 = vsubl_u8(d6u8, vget_low_u8(q14u8));
+ q3u16 = vsubl_u8(d7u8, vget_high_u8(q14u8));
+
+ d8s16 = vreinterpret_s16_u16(vget_low_u16(q4u16));
+ d9s16 = vreinterpret_s16_u16(vget_high_u16(q4u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q4u16));
+ q9s32 = vmlal_s16(q9s32, d8s16, d8s16);
+ q10s32 = vmlal_s16(q10s32, d9s16, d9s16);
+ d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
+ d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q5u16));
+ q9s32 = vmlal_s16(q9s32, d10s16, d10s16);
+ q10s32 = vmlal_s16(q10s32, d11s16, d11s16);
+ d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
+ d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q6u16));
+ q9s32 = vmlal_s16(q9s32, d12s16, d12s16);
+ q10s32 = vmlal_s16(q10s32, d13s16, d13s16);
+ d14s16 = vreinterpret_s16_u16(vget_low_u16(q7u16));
+ d15s16 = vreinterpret_s16_u16(vget_high_u16(q7u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q7u16));
+ q9s32 = vmlal_s16(q9s32, d14s16, d14s16);
+ q10s32 = vmlal_s16(q10s32, d15s16, d15s16);
+ d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
+ d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
+ q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
+ q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
+ d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
+ d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
+ q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
+ q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
+ d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
+ d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
+ q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
+ q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
+ d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
+ d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
+ q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
+ q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vget_low_s64(q0s64);
+ d1s64 = vget_high_s64(q0s64);
+ d2s64 = vget_low_s64(q1s64);
+ d3s64 = vget_high_s64(q1s64);
+ d0s64 = vadd_s64(d0s64, d1s64);
+ d1s64 = vadd_s64(d2s64, d3s64);
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance_halfpixvar16x16_v_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ uint8x8_t d0u8, d1u8, d4u8, d5u8, d8u8, d9u8, d12u8, d13u8;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64, d2s64, d3s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q15u8;
+ uint16x8_t q0u16, q1u16, q2u16, q3u16, q11u16, q12u16, q13u16, q14u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ q0u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ for (i = 0; i < 4; i++) { // vertical half-pel variance loop
+ q2u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q4u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q6u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q15u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+
+ q1u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q3u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q5u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q7u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
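+ // Average each row with the row below it: the vertical half-pel filter.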
+ q0u8 = vrhaddq_u8(q0u8, q2u8);
+ q2u8 = vrhaddq_u8(q2u8, q4u8);
+ q4u8 = vrhaddq_u8(q4u8, q6u8);
+ q6u8 = vrhaddq_u8(q6u8, q15u8);
+
+ d0u8 = vget_low_u8(q0u8);
+ d1u8 = vget_high_u8(q0u8);
+ d4u8 = vget_low_u8(q2u8);
+ d5u8 = vget_high_u8(q2u8);
+ d8u8 = vget_low_u8(q4u8);
+ d9u8 = vget_high_u8(q4u8);
+ d12u8 = vget_low_u8(q6u8);
+ d13u8 = vget_high_u8(q6u8);
+
+ q11u16 = vsubl_u8(d0u8, vget_low_u8(q1u8));
+ q12u16 = vsubl_u8(d1u8, vget_high_u8(q1u8));
+ q13u16 = vsubl_u8(d4u8, vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(d5u8, vget_high_u8(q3u8));
+ q0u16 = vsubl_u8(d8u8, vget_low_u8(q5u8));
+ q1u16 = vsubl_u8(d9u8, vget_high_u8(q5u8));
+ q2u16 = vsubl_u8(d12u8, vget_low_u8(q7u8));
+ q3u16 = vsubl_u8(d13u8, vget_high_u8(q7u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+ q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+ q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
+ d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
+ q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
+ q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
+ d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
+ d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
+ q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
+ q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
+ d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
+ d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
+ q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
+ q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
+ d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
+ d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
+ q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
+ q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
+
+ q0u8 = q15u8;
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vget_low_s64(q0s64);
+ d1s64 = vget_high_s64(q0s64);
+ d2s64 = vget_low_s64(q1s64);
+ d3s64 = vget_high_s64(q1s64);
+ d0s64 = vadd_s64(d0s64, d1s64);
+ d1s64 = vadd_s64(d2s64, d3s64);
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance_halfpixvar16x16_hv_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16;
+ int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64, d2s64, d3s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
+ uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16;
+ int32x4_t q13s32, q14s32, q15s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q13s32 = vdupq_n_s32(0);
+ q14s32 = vdupq_n_s32(0);
+ q15s32 = vdupq_n_s32(0);
+
+ q0u8 = vld1q_u8(src_ptr);
+ q1u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q1u8 = vextq_u8(q0u8, q1u8, 1);
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+ for (i = 0; i < 4; i++) { // horizontal/vertical half-pel variance loop
+ q2u8 = vld1q_u8(src_ptr);
+ q3u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q4u8 = vld1q_u8(src_ptr);
+ q5u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q6u8 = vld1q_u8(src_ptr);
+ q7u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q8u8 = vld1q_u8(src_ptr);
+ q9u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+
+ q3u8 = vextq_u8(q2u8, q3u8, 1);
+ q5u8 = vextq_u8(q4u8, q5u8, 1);
+ q7u8 = vextq_u8(q6u8, q7u8, 1);
+ q9u8 = vextq_u8(q8u8, q9u8, 1);
+
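+ // First average the horizontal pairs, then average vertically adjacent
+ // rows to apply the diagonal half-pel filter.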
+ q1u8 = vrhaddq_u8(q2u8, q3u8);
+ q2u8 = vrhaddq_u8(q4u8, q5u8);
+ q3u8 = vrhaddq_u8(q6u8, q7u8);
+ q4u8 = vrhaddq_u8(q8u8, q9u8);
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+ q1u8 = vrhaddq_u8(q1u8, q2u8);
+ q2u8 = vrhaddq_u8(q2u8, q3u8);
+ q3u8 = vrhaddq_u8(q3u8, q4u8);
+
+ q5u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q6u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q7u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q8u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
+ d0u8 = vget_low_u8(q0u8);
+ d1u8 = vget_high_u8(q0u8);
+ d2u8 = vget_low_u8(q1u8);
+ d3u8 = vget_high_u8(q1u8);
+ d4u8 = vget_low_u8(q2u8);
+ d5u8 = vget_high_u8(q2u8);
+ d6u8 = vget_low_u8(q3u8);
+ d7u8 = vget_high_u8(q3u8);
+
+ q9u16 = vsubl_u8(d0u8, vget_low_u8(q5u8));
+ q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8));
+ q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8));
+ q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8));
+ q0u16 = vsubl_u8(d4u8, vget_low_u8(q7u8));
+ q1u16 = vsubl_u8(d5u8, vget_high_u8(q7u8));
+ q5u16 = vsubl_u8(d6u8, vget_low_u8(q8u8));
+ q6u16 = vsubl_u8(d7u8, vget_high_u8(q8u8));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16));
+ q14s32 = vmlal_s16(q14s32, d18s16, d18s16);
+ q15s32 = vmlal_s16(q15s32, d19s16, d19s16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16));
+ q14s32 = vmlal_s16(q14s32, d20s16, d20s16);
+ q15s32 = vmlal_s16(q15s32, d21s16, d21s16);
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16));
+ q14s32 = vmlal_s16(q14s32, d22s16, d22s16);
+ q15s32 = vmlal_s16(q15s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16));
+ q14s32 = vmlal_s16(q14s32, d24s16, d24s16);
+ q15s32 = vmlal_s16(q15s32, d25s16, d25s16);
+
+ d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
+ d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16));
+ q14s32 = vmlal_s16(q14s32, d0s16, d0s16);
+ q15s32 = vmlal_s16(q15s32, d1s16, d1s16);
+
+ d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
+ d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16));
+ q14s32 = vmlal_s16(q14s32, d2s16, d2s16);
+ q15s32 = vmlal_s16(q15s32, d3s16, d3s16);
+
+ d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
+ d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16));
+ q14s32 = vmlal_s16(q14s32, d10s16, d10s16);
+ q15s32 = vmlal_s16(q15s32, d11s16, d11s16);
+
+ d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
+ d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16));
+ q14s32 = vmlal_s16(q14s32, d12s16, d12s16);
+ q15s32 = vmlal_s16(q15s32, d13s16, d13s16);
+
+ q0u8 = q4u8;
+ }
+
+ q15s32 = vaddq_s32(q14s32, q15s32);
+ q0s64 = vpaddlq_s32(q13s32);
+ q1s64 = vpaddlq_s32(q15s32);
+
+ d0s64 = vget_low_s64(q0s64);
+ d1s64 = vget_high_s64(q0s64);
+ d2s64 = vget_low_s64(q1s64);
+ d3s64 = vget_high_s64(q1s64);
+ d0s64 = vadd_s64(d0s64, d1s64);
+ d1s64 = vadd_s64(d2s64, d3s64);
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+enum { kWidth8 = 8 };
+enum { kHeight8 = 8 };
+enum { kHeight8PlusOne = 9 };
+enum { kPixelStepOne = 1 };
+enum { kAlign16 = 16 };
+
+#define FILTER_BITS 7
+
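+// Reduce all eight signed 16-bit lanes to one scalar sum.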
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+ const int32x4_t a = vpaddlq_s16(v_16x8);
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+ const int64x2_t b = vpaddlq_s32(v_32x4);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
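+// Accumulate the sum and the sum of squared differences over a w x h block;
+// w must be a multiple of 8.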
+static void variance_neon_w8(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ int w, int h, unsigned int *sse, int *sum) {
+ int i, j;
+ int16x8_t v_sum = vdupq_n_s16(0);
+ int32x4_t v_sse_lo = vdupq_n_s32(0);
+ int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const uint8x8_t v_a = vld1_u8(&a[j]);
+ const uint8x8_t v_b = vld1_u8(&b[j]);
+ const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
+ const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
+ v_sum = vaddq_s16(v_sum, sv_diff);
+ v_sse_lo = vmlal_s16(v_sse_lo,
+ vget_low_s16(sv_diff),
+ vget_low_s16(sv_diff));
+ v_sse_hi = vmlal_s16(v_sse_hi,
+ vget_high_s16(sv_diff),
+ vget_high_s16(sv_diff));
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+
+ *sum = horizontal_add_s16x8(v_sum);
+ *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+static unsigned int variance8x8_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
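+ // variance = sse - sum^2 / (8 * 8); the int64_t cast keeps sum * sum
+ // from overflowing.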
+ return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));
+}
+
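+// First-order bilinear filter: out = (src[0] * f0 + src[pixel_step] * f1 + 64) >> 7
+// for each of the 8 lanes per row.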
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint16_t *vpx_filter) {
+ const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]);
+ const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]);
+ unsigned int i;
+ for (i = 0; i < output_height; ++i) {
+ const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
+ const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
+ const uint16x8_t a = vmull_u8(src_0, f0);
+ const uint16x8_t b = vmlal_u8(a, src_1, f1);
+ const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
+ vst1_u8(&output_ptr[0], out);
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+}
+
+unsigned int vp8_sub_pixel_variance8x8_neon(
+ const unsigned char *src,
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8);
+ DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);
+ if (xoffset == 0) {
+ var_filter_block2d_bil_w8(src, temp2, src_stride, kWidth8, kHeight8,
+ kWidth8, bilinear_taps_coeff[yoffset]);
+ } else if (yoffset == 0) {
+ var_filter_block2d_bil_w8(src, temp2, src_stride, kPixelStepOne,
+ kHeight8PlusOne, kWidth8,
+ bilinear_taps_coeff[xoffset]);
+ } else {
+ var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,
+ kHeight8PlusOne, kWidth8,
+ bilinear_taps_coeff[xoffset]);
+ var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,
+ kWidth8, bilinear_taps_coeff[yoffset]);
+ }
+ return variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);
+}
+
diff --git a/source/libvpx/vp8/common/arm/reconintra_arm.c b/source/libvpx/vp8/common/arm/reconintra_arm.c
deleted file mode 100644
index e55a33c..0000000
--- a/source/libvpx/vp8/common/arm/reconintra_arm.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vp8/common/blockd.h"
-#include "vpx_mem/vpx_mem.h"
-
-#if HAVE_NEON_ASM
-extern void vp8_build_intra_predictors_mby_neon_func(
- unsigned char *y_buffer,
- unsigned char *ypred_ptr,
- int y_stride,
- int mode,
- int Up,
- int Left);
-
-void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x)
-{
- unsigned char *y_buffer = x->dst.y_buffer;
- unsigned char *ypred_ptr = x->predictor;
- int y_stride = x->dst.y_stride;
- int mode = x->mode_info_context->mbmi.mode;
- int Up = x->up_available;
- int Left = x->left_available;
-
- vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
-}
-
-extern void vp8_build_intra_predictors_mby_s_neon_func(
- unsigned char *y_buffer,
- unsigned char *ypred_ptr,
- int y_stride,
- int mode,
- int Up,
- int Left);
-
-void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x)
-{
- unsigned char *y_buffer = x->dst.y_buffer;
- unsigned char *ypred_ptr = x->predictor;
- int y_stride = x->dst.y_stride;
- int mode = x->mode_info_context->mbmi.mode;
- int Up = x->up_available;
- int Left = x->left_available;
-
- vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
-}
-
-#endif
diff --git a/source/libvpx/vp8/common/arm/variance_arm.c b/source/libvpx/vp8/common/arm/variance_arm.c
index e3f7083..467a509 100644
--- a/source/libvpx/vp8/common/arm/variance_arm.c
+++ b/source/libvpx/vp8/common/arm/variance_arm.c
@@ -95,7 +95,7 @@ unsigned int vp8_sub_pixel_variance16x16_armv6
#endif /* HAVE_MEDIA */
-#if HAVE_NEON_ASM
+#if HAVE_NEON
extern unsigned int vp8_sub_pixel_variance16x16_neon_func
(
diff --git a/source/libvpx/vp8/common/onyx.h b/source/libvpx/vp8/common/onyx.h
index b05ad14..d48c4fe 100644
--- a/source/libvpx/vp8/common/onyx.h
+++ b/source/libvpx/vp8/common/onyx.h
@@ -224,7 +224,7 @@ extern "C"
int arnr_strength;
int arnr_type;
- struct vpx_fixed_buf two_pass_stats_in;
+ vpx_fixed_buf_t two_pass_stats_in;
struct vpx_codec_pkt_list *output_pkt_list;
vp8e_tuning tuning;
diff --git a/source/libvpx/vp8/common/rtcd_defs.pl b/source/libvpx/vp8/common/rtcd_defs.pl
index 204cbf0..a90c876 100644
--- a/source/libvpx/vp8/common/rtcd_defs.pl
+++ b/source/libvpx/vp8/common/rtcd_defs.pl
@@ -38,15 +38,13 @@ $vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6;
$vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2;
add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2/;
$vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6;
-$vp8_dequant_idct_add_y_block_neon_asm=vp8_dequant_idct_add_y_block_neon;
$vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2/;
$vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6;
-$vp8_dequant_idct_add_uv_block_neon_asm=vp8_dequant_idct_add_uv_block_neon;
$vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
#
@@ -58,9 +56,8 @@ $vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6;
$vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2;
add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bv mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2/;
$vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6;
-$vp8_loop_filter_bv_neon_asm=vp8_loop_filter_bv_neon;
$vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2;
add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
@@ -69,19 +66,18 @@ $vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6;
$vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2;
add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bh mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2/;
$vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6;
-$vp8_loop_filter_bh_neon_asm=vp8_loop_filter_bh_neon;
$vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2;
add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon_asm/;
+specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon/;
$vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
$vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx;
$vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
$vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6;
-$vp8_loop_filter_simple_mbv_neon_asm=vp8_loop_filter_mbvs_neon;
+$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon/;
@@ -92,12 +88,12 @@ $vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6;
$vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon_asm/;
+specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon/;
$vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
$vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx;
$vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
$vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6;
-$vp8_loop_filter_simple_bv_neon_asm=vp8_loop_filter_bvs_neon;
+$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon/;
@@ -153,11 +149,10 @@ $vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6;
$vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
add_proto qw/void vp8_build_intra_predictors_mby_s/, "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride";
-specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3/;
-#TODO: fix assembly for neon
+specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3 neon/;
add_proto qw/void vp8_build_intra_predictors_mbuv_s/, "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride";
-specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3/;
+specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3 neon/;
add_proto qw/void vp8_intra4x4_predict/, "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left";
specialize qw/vp8_intra4x4_predict media/;
@@ -446,14 +441,12 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
# Forward DCT
#
add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 mmx sse2 media neon_asm/;
+specialize qw/vp8_short_fdct4x4 mmx sse2 media neon/;
$vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6;
-$vp8_short_fdct4x4_neon_asm=vp8_short_fdct4x4_neon;
add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 mmx sse2 media neon_asm/;
+specialize qw/vp8_short_fdct8x4 mmx sse2 media neon/;
$vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6;
-$vp8_short_fdct8x4_neon_asm=vp8_short_fdct8x4_neon;
add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
specialize qw/vp8_short_walsh4x4 sse2 media neon/;
@@ -537,13 +530,6 @@ if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
}
#
-# Pick Loopfilter
-#
-add_proto qw/void vp8_yv12_copy_partial_frame/, "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
-specialize qw/vp8_yv12_copy_partial_frame neon_asm/;
-$vp8_yv12_copy_partial_frame_neon_asm=vp8_yv12_copy_partial_frame_neon;
-
-#
# Denoiser filter
#
if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
@@ -551,7 +537,6 @@ if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
specialize qw/vp8_denoiser_filter sse2 neon/;
add_proto qw/int vp8_denoiser_filter_uv/, "unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
specialize qw/vp8_denoiser_filter_uv sse2 neon/;
-
}
# End of encoder only functions
diff --git a/source/libvpx/vp8/encoder/arm/neon/picklpf_arm.c b/source/libvpx/vp8/encoder/arm/neon/picklpf_arm.c
deleted file mode 100644
index ec8071e..0000000
--- a/source/libvpx/vp8/encoder/arm/neon/picklpf_arm.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/loopfilter.h"
-#include "vpx_scale/yv12config.h"
-
-extern void vp8_memcpy_partial_neon(unsigned char *dst_ptr,
- unsigned char *src_ptr,
- int sz);
-
-
-void vp8_yv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc,
- YV12_BUFFER_CONFIG *dst_ybc)
-{
- unsigned char *src_y, *dst_y;
- int yheight;
- int ystride;
- int yoffset;
- int linestocopy;
-
- yheight = src_ybc->y_height;
- ystride = src_ybc->y_stride;
-
- /* number of MB rows to use in partial filtering */
- linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION;
- linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */
-
- /* Copy extra 4 so that full filter context is available if filtering done
- * on the copied partial frame and not original. Partial filter does mb
- * filtering for top row also, which can modify 3 pixels above.
- */
- linestocopy += 4;
- /* partial image starts at ~middle of frame (macroblock border) */
- yoffset = ystride * (((yheight >> 5) * 16) - 4);
- src_y = src_ybc->y_buffer + yoffset;
- dst_y = dst_ybc->y_buffer + yoffset;
-
- vp8_memcpy_partial_neon(dst_y, src_y, ystride * linestocopy);
-}
diff --git a/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm b/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm
deleted file mode 100644
index 5ea8dd8..0000000
--- a/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ /dev/null
@@ -1,221 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_short_fdct4x4_neon|
- EXPORT |vp8_short_fdct8x4_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=4
-
-
- ALIGN 16 ; enable use of @128 bit aligned loads
-coeff
- DCW 5352, 5352, 5352, 5352
- DCW 2217, 2217, 2217, 2217
- DCD 14500, 14500, 14500, 14500
- DCD 7500, 7500, 7500, 7500
- DCD 12000, 12000, 12000, 12000
- DCD 51000, 51000, 51000, 51000
-
-;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct4x4_neon| PROC
-
- ; Part one
- vld1.16 {d0}, [r0@64], r2
- adr r12, coeff
- vld1.16 {d1}, [r0@64], r2
- vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
- vld1.16 {d2}, [r0@64], r2
- vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
- vld1.16 {d3}, [r0@64], r2
-
- ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
- vtrn.32 d0, d2
- vtrn.32 d1, d3
- vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000
- vtrn.16 d0, d1
- vtrn.16 d2, d3
-
- vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3]
- vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2]
- vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2]
- vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3]
-
- vshl.s16 q2, q2, #3 ; (a1, b1) << 3
- vshl.s16 q3, q3, #3 ; (c1, d1) << 3
-
- vadd.s16 d0, d4, d5 ; op[0] = a1 + b1
- vsub.s16 d2, d4, d5 ; op[2] = a1 - b1
-
- vmlal.s16 q9, d7, d16 ; d1*5352 + 14500
- vmlal.s16 q10, d7, d17 ; d1*2217 + 7500
- vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500
- vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500
-
- vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
- vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12
-
-
- ; Part two
-
- ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
- vtrn.32 d0, d2
- vtrn.32 d1, d3
- vtrn.16 d0, d1
- vtrn.16 d2, d3
-
- vmov.s16 d26, #7
-
- vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12]
- vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8]
- vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8]
- vadd.s16 d4, d4, d26 ; a1 + 7
- vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12]
-
- vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7
- vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7
-
- vmlal.s16 q11, d7, d16 ; d1*5352 + 12000
- vmlal.s16 q12, d7, d17 ; d1*2217 + 51000
-
- vceq.s16 d4, d7, #0
-
- vshr.s16 d0, d0, #4
- vshr.s16 d2, d2, #4
-
- vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000
- vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000
-
- vmvn d4, d4
- vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
- vsub.s16 d1, d1, d4 ; op[4] += (d1!=0)
- vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
-
- vst1.16 {q0, q1}, [r1@128]
-
- bx lr
-
- ENDP
-
-;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct8x4_neon| PROC
-
- ; Part one
-
- vld1.16 {q0}, [r0@128], r2
- adr r12, coeff
- vld1.16 {q1}, [r0@128], r2
- vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
- vld1.16 {q2}, [r0@128], r2
- vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
- vld1.16 {q3}, [r0@128], r2
-
- ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
- vtrn.32 q0, q2 ; [A0|B0]
- vtrn.32 q1, q3 ; [A1|B1]
- vtrn.16 q0, q1 ; [A2|B2]
- vtrn.16 q2, q3 ; [A3|B3]
-
- vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3]
- vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2]
- vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2]
- vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3]
-
- vshl.s16 q11, q11, #3 ; a1 << 3
- vshl.s16 q12, q12, #3 ; b1 << 3
- vshl.s16 q13, q13, #3 ; c1 << 3
- vshl.s16 q14, q14, #3 ; d1 << 3
-
- vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1
- vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1
-
- vmov.s16 q11, q9 ; 14500
- vmov.s16 q12, q10 ; 7500
-
- vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500
- vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500
- vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500
- vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500
-
- vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500
- vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500
- vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500
- vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500
-
- vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
- vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12
- vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
- vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12
-
-
- ; Part two
- vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000
-
- ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
- vtrn.32 q0, q2 ; q0=[A0 | B0]
- vtrn.32 q1, q3 ; q1=[A4 | B4]
- vtrn.16 q0, q1 ; q2=[A8 | B8]
- vtrn.16 q2, q3 ; q3=[A12|B12]
-
- vmov.s16 q15, #7
-
- vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12]
- vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8]
- vadd.s16 q11, q11, q15 ; a1 + 7
- vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8]
- vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12]
-
- vadd.s16 q0, q11, q12 ; a1 + b1 + 7
- vsub.s16 q1, q11, q12 ; a1 - b1 + 7
-
- vmov.s16 q11, q9 ; 12000
- vmov.s16 q12, q10 ; 51000
-
- vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4
- vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4
- vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4
- vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4
-
-
- vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000
- vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000
- vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000
- vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000
-
- vceq.s16 q14, q14, #0
-
- vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000
- vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000
- vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000
- vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000
-
- vmvn q14, q14
-
- vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
- vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
- vsub.s16 d1, d1, d28 ; A[4] += (d1!=0)
-
- vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
- vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
- vsub.s16 d5, d5, d29 ; B[4] += (d1!=0)
-
- vst1.16 {q0, q1}, [r1@128]! ; block A
- vst1.16 {q2, q3}, [r1@128]! ; block B
-
- bx lr
-
- ENDP
-
- END
-
diff --git a/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c b/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c
new file mode 100644
index 0000000..391e5f9
--- /dev/null
+++ b/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_fdct4x4_neon(
+ int16_t *input,
+ int16_t *output,
+ int pitch) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d16s16, d17s16, d26s16, dEmptys16;
+ uint16x4_t d4u16;
+ int16x8_t q0s16, q1s16;
+ int32x4_t q9s32, q10s32, q11s32, q12s32;
+ int16x4x2_t v2tmp0, v2tmp1;
+ int32x2x2_t v2tmp2, v2tmp3;
+
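+ // Set up the fdct multipliers (5352, 2217) and the rounding constants for
+ // part one (14500, 7500) and part two (12000, 51000).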
+ d16s16 = vdup_n_s16(5352);
+ d17s16 = vdup_n_s16(2217);
+ q9s32 = vdupq_n_s32(14500);
+ q10s32 = vdupq_n_s32(7500);
+ q11s32 = vdupq_n_s32(12000);
+ q12s32 = vdupq_n_s32(51000);
+
+ // Part one
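+ // pitch is given in bytes; the input is int16_t, so halve it to get the
+ // element stride.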
+ pitch >>= 1;
+ d0s16 = vld1_s16(input);
+ input += pitch;
+ d1s16 = vld1_s16(input);
+ input += pitch;
+ d2s16 = vld1_s16(input);
+ input += pitch;
+ d3s16 = vld1_s16(input);
+
+ v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+ vreinterpret_s32_s16(d2s16));
+ v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+ vreinterpret_s32_s16(d3s16));
+ v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0
+ vreinterpret_s16_s32(v2tmp3.val[0])); // d1
+ v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2
+ vreinterpret_s16_s32(v2tmp3.val[1])); // d3
+
+ d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+ d4s16 = vshl_n_s16(d4s16, 3);
+ d5s16 = vshl_n_s16(d5s16, 3);
+ d6s16 = vshl_n_s16(d6s16, 3);
+ d7s16 = vshl_n_s16(d7s16, 3);
+
+ d0s16 = vadd_s16(d4s16, d5s16);
+ d2s16 = vsub_s16(d4s16, d5s16);
+
+ q9s32 = vmlal_s16(q9s32, d7s16, d16s16);
+ q10s32 = vmlal_s16(q10s32, d7s16, d17s16);
+ q9s32 = vmlal_s16(q9s32, d6s16, d17s16);
+ q10s32 = vmlsl_s16(q10s32, d6s16, d16s16);
+
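+ // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
+ // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12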
+ d1s16 = vshrn_n_s32(q9s32, 12);
+ d3s16 = vshrn_n_s32(q10s32, 12);
+
+ // Part two
+ v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+ vreinterpret_s32_s16(d2s16));
+ v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+ vreinterpret_s32_s16(d3s16));
+ v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0
+ vreinterpret_s16_s32(v2tmp3.val[0])); // d1
+ v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2
+ vreinterpret_s16_s32(v2tmp3.val[1])); // d3
+
+ d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+ d26s16 = vdup_n_s16(7);
+ d4s16 = vadd_s16(d4s16, d26s16);
+
+ d0s16 = vadd_s16(d4s16, d5s16);
+ d2s16 = vsub_s16(d4s16, d5s16);
+
+ q11s32 = vmlal_s16(q11s32, d7s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d7s16, d17s16);
+
+ dEmptys16 = vdup_n_s16(0);
+ d4u16 = vceq_s16(d7s16, dEmptys16);
+
+ d0s16 = vshr_n_s16(d0s16, 4);
+ d2s16 = vshr_n_s16(d2s16, 4);
+
+ q11s32 = vmlal_s16(q11s32, d6s16, d17s16);
+ q12s32 = vmlsl_s16(q12s32, d6s16, d16s16);
+
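+ // The vceq mask is 0xFFFF where d1 == 0; inverting it yields -1 in lanes
+ // where d1 != 0, so the vsub below adds 1: op[4] += (d1 != 0).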
+ d4u16 = vmvn_u16(d4u16);
+ d1s16 = vshrn_n_s32(q11s32, 16);
+ d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16));
+ d3s16 = vshrn_n_s32(q12s32, 16);
+
+ q0s16 = vcombine_s16(d0s16, d1s16);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+
+ vst1q_s16(output, q0s16);
+ vst1q_s16(output + 8, q1s16);
+ return;
+}
+
+void vp8_short_fdct8x4_neon(
+ int16_t *input,
+ int16_t *output,
+ int pitch) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16;
+ uint16x4_t d28u16, d29u16;
+ uint16x8_t q14u16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16;
+ int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16;
+ int32x4_t q9s32, q10s32, q11s32, q12s32;
+ int16x8x2_t v2tmp0, v2tmp1;
+ int32x4x2_t v2tmp2, v2tmp3;
+
+ d16s16 = vdup_n_s16(5352);
+ d17s16 = vdup_n_s16(2217);
+ q9s32 = vdupq_n_s32(14500);
+ q10s32 = vdupq_n_s32(7500);
+
+ // Part one
+ pitch >>= 1;
+ q0s16 = vld1q_s16(input);
+ input += pitch;
+ q1s16 = vld1q_s16(input);
+ input += pitch;
+ q2s16 = vld1q_s16(input);
+ input += pitch;
+ q3s16 = vld1q_s16(input);
+
+ v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+ vreinterpretq_s32_s16(q2s16));
+ v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+ vreinterpretq_s32_s16(q3s16));
+ v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0
+ vreinterpretq_s16_s32(v2tmp3.val[0])); // q1
+ v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2
+ vreinterpretq_s16_s32(v2tmp3.val[1])); // q3
+
+ q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+ q11s16 = vshlq_n_s16(q11s16, 3);
+ q12s16 = vshlq_n_s16(q12s16, 3);
+ q13s16 = vshlq_n_s16(q13s16, 3);
+ q14s16 = vshlq_n_s16(q14s16, 3);
+
+ q0s16 = vaddq_s16(q11s16, q12s16);
+ q2s16 = vsubq_s16(q11s16, q12s16);
+
+ q11s32 = q9s32;
+ q12s32 = q10s32;
+
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+ q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+ q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+ q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+ q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+ q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+ d2s16 = vshrn_n_s32(q9s32, 12);
+ d6s16 = vshrn_n_s32(q10s32, 12);
+ d3s16 = vshrn_n_s32(q11s32, 12);
+ d7s16 = vshrn_n_s32(q12s32, 12);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+
+ // Part two
+ q9s32 = vdupq_n_s32(12000);
+ q10s32 = vdupq_n_s32(51000);
+
+ v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+ vreinterpretq_s32_s16(q2s16));
+ v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+ vreinterpretq_s32_s16(q3s16));
+ v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0
+ vreinterpretq_s16_s32(v2tmp3.val[0])); // q1
+ v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2
+ vreinterpretq_s16_s32(v2tmp3.val[1])); // q3
+
+ q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+ q15s16 = vdupq_n_s16(7);
+ q11s16 = vaddq_s16(q11s16, q15s16);
+ q0s16 = vaddq_s16(q11s16, q12s16);
+ q1s16 = vsubq_s16(q11s16, q12s16);
+
+ q11s32 = q9s32;
+ q12s32 = q10s32;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+ d2s16 = vget_low_s16(q1s16);
+ d3s16 = vget_high_s16(q1s16);
+
+ d0s16 = vshr_n_s16(d0s16, 4);
+ d4s16 = vshr_n_s16(d1s16, 4);
+ d2s16 = vshr_n_s16(d2s16, 4);
+ d6s16 = vshr_n_s16(d3s16, 4);
+
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+ q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+ q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+ q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+ q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+ q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+ d1s16 = vshrn_n_s32(q9s32, 16);
+ d3s16 = vshrn_n_s32(q10s32, 16);
+ d5s16 = vshrn_n_s32(q11s32, 16);
+ d7s16 = vshrn_n_s32(q12s32, 16);
+
+ qEmptys16 = vdupq_n_s16(0);
+ q14u16 = vceqq_s16(q14s16, qEmptys16);
+ q14u16 = vmvnq_u16(q14u16);
+
+ d28u16 = vget_low_u16(q14u16);
+ d29u16 = vget_high_u16(q14u16);
+ d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16));
+ d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16));
+
+ q0s16 = vcombine_s16(d0s16, d1s16);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+
+ vst1q_s16(output, q0s16);
+ vst1q_s16(output + 8, q1s16);
+ vst1q_s16(output + 16, q2s16);
+ vst1q_s16(output + 24, q3s16);
+ return;
+}
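
Both new functions transpose 4x4 tiles of int16_t with the same two-step vtrn idiom (32-bit transposes across row pairs, then 16-bit transposes within them); the 8x4 variant simply applies it to q registers. A self-contained sketch of the 64-bit-vector form, assuming arm_neon.h, with illustrative names:

    #include <arm_neon.h>

    /* Transpose a 4x4 block of int16_t held in four 64-bit vectors.
     * rows[0..3] enter as rows and leave as columns. */
    static void transpose_4x4_s16(int16x4_t rows[4]) {
        /* Step 1: swap 32-bit halves between rows (0,2) and (1,3). */
        int32x2x2_t t02 = vtrn_s32(vreinterpret_s32_s16(rows[0]),
                                   vreinterpret_s32_s16(rows[2]));
        int32x2x2_t t13 = vtrn_s32(vreinterpret_s32_s16(rows[1]),
                                   vreinterpret_s32_s16(rows[3]));
        /* Step 2: swap 16-bit lanes to finish the transpose. */
        int16x4x2_t r01 = vtrn_s16(vreinterpret_s16_s32(t02.val[0]),
                                   vreinterpret_s16_s32(t13.val[0]));
        int16x4x2_t r23 = vtrn_s16(vreinterpret_s16_s32(t02.val[1]),
                                   vreinterpret_s16_s32(t13.val[1]));
        rows[0] = r01.val[0];
        rows[1] = r01.val[1];
        rows[2] = r23.val[0];
        rows[3] = r23.val[1];
    }
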
diff --git a/source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
deleted file mode 100644
index d219e2d..0000000
--- a/source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
+++ /dev/null
@@ -1,72 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_memcpy_partial_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;=========================================
-;this is not a full memcpy function!!!
-;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
-; int sz);
-|vp8_memcpy_partial_neon| PROC
- vpush {d8-d15}
- ;pld [r1] ;preload pred data
- ;pld [r1, #128]
- ;pld [r1, #256]
- ;pld [r1, #384]
-
- mov r12, r2, lsr #8 ;copy 256 bytes data at one time
-
-memcpy_neon_loop
- vld1.8 {q0, q1}, [r1]! ;load src data
- subs r12, r12, #1
- vld1.8 {q2, q3}, [r1]!
- vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr
- vld1.8 {q4, q5}, [r1]!
- vst1.8 {q2, q3}, [r0]!
- vld1.8 {q6, q7}, [r1]!
- vst1.8 {q4, q5}, [r0]!
- vld1.8 {q8, q9}, [r1]!
- vst1.8 {q6, q7}, [r0]!
- vld1.8 {q10, q11}, [r1]!
- vst1.8 {q8, q9}, [r0]!
- vld1.8 {q12, q13}, [r1]!
- vst1.8 {q10, q11}, [r0]!
- vld1.8 {q14, q15}, [r1]!
- vst1.8 {q12, q13}, [r0]!
- vst1.8 {q14, q15}, [r0]!
-
- ;pld [r1] ;preload pred data -- need to adjust for real device
- ;pld [r1, #128]
- ;pld [r1, #256]
- ;pld [r1, #384]
-
- bne memcpy_neon_loop
-
- ands r3, r2, #0xff ;extra copy
- beq done_copy_neon_loop
-
-extra_copy_neon_loop
- vld1.8 {q0}, [r1]! ;load src data
- subs r3, r3, #16
- vst1.8 {q0}, [r0]!
- bne extra_copy_neon_loop
-
-done_copy_neon_loop
- vpop {d8-d15}
- bx lr
- ENDP
-
- END
diff --git a/source/libvpx/vp8/encoder/denoising.c b/source/libvpx/vp8/encoder/denoising.c
index 2da0d8c..0c98eb1 100644
--- a/source/libvpx/vp8/encoder/denoising.c
+++ b/source/libvpx/vp8/encoder/denoising.c
@@ -413,9 +413,11 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
denoiser->nmse_source_diff = 0;
denoiser->nmse_source_diff_count = 0;
// TODO(marpan): Adjust thresholds, including effect on resolution.
- denoiser->threshold_aggressive_mode = 40;
+ denoiser->threshold_aggressive_mode = 35;
-    if (width * height > 640 * 480)
-        denoiser->threshold_aggressive_mode = 180;
+    if (width * height > 1280 * 720)
+        denoiser->threshold_aggressive_mode = 1400;
+    else if (width * height > 640 * 480)
+        denoiser->threshold_aggressive_mode = 150;
return 0;
}
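
Restated as a standalone helper (the name is hypothetical), the hunk above selects one threshold per resolution class; the largest area must be tested first so each frame size lands in exactly one tier:

    /* Illustrative restatement of the tier selection above. */
    static int aggressive_mode_threshold(int width, int height) {
        const int area = width * height;
        if (area > 1280 * 720)
            return 1400;  /* above 720p */
        else if (area > 640 * 480)
            return 150;   /* above VGA, up to 720p */
        return 35;        /* VGA and below */
    }
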
diff --git a/source/libvpx/vp8/encoder/onyx_if.c b/source/libvpx/vp8/encoder/onyx_if.c
index 38b8999..74e75c4 100644
--- a/source/libvpx/vp8/encoder/onyx_if.c
+++ b/source/libvpx/vp8/encoder/onyx_if.c
@@ -3293,6 +3293,7 @@ static void update_reference_frames(VP8_COMP *cpi)
}
+#if CONFIG_TEMPORAL_DENOISING
static void process_denoiser_mode_change(VP8_COMP *cpi) {
const VP8_COMMON *const cm = &cpi->common;
int i, j;
@@ -3399,6 +3400,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
cpi->denoiser.nmse_source_diff_count = 0;
}
}
+#endif
void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
{
diff --git a/source/libvpx/vp8/encoder/pickinter.c b/source/libvpx/vp8/encoder/pickinter.c
index 8dd1881..43f8957 100644
--- a/source/libvpx/vp8/encoder/pickinter.c
+++ b/source/libvpx/vp8/encoder/pickinter.c
@@ -487,6 +487,7 @@ static int evaluate_inter_mode(unsigned int* sse, int rate2, int* distortion2,
MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
int_mv mv = x->e_mbd.mode_info_context->mbmi.mv;
int this_rd;
+ int denoise_aggressive = 0;
/* Exit early and don't compute the distortion if this macroblock
* is marked inactive. */
if (cpi->active_map_enabled && x->active_ptr[0] == 0)
@@ -505,10 +506,17 @@ static int evaluate_inter_mode(unsigned int* sse, int rate2, int* distortion2,
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, *distortion2);
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ denoise_aggressive =
+ (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) ? 1 : 0;
+ }
+#endif
+
// Adjust rd for ZEROMV and LAST, if LAST is the closest reference frame.
if (this_mode == ZEROMV &&
x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME &&
- cpi->closest_reference_frame == LAST_FRAME)
+ (denoise_aggressive || cpi->closest_reference_frame == LAST_FRAME))
{
this_rd = ((int64_t)this_rd) * rd_adj / 100;
}
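
The hunk above widens the case in which ZEROMV/LAST gets a rate-distortion discount: with an aggressive denoiser, favoring static blocks keeps the denoised reference stable. A hedged sketch of the percentage bias, where RDCOST is only a stand-in along the lines of the encoder's macro and the inputs are hypothetical; rd_adj below 100 favors the mode, above 100 penalizes it:

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in resembling the encoder's rate-distortion cost macro. */
    #define RDCOST(rm, dm, r, d) (((128 + (r) * (rm)) >> 8) + (dm) * (d))

    int main(void) {
        int this_rd = RDCOST(100, 16, 300, 50);  /* hypothetical inputs */
        int rd_adj = 80;                         /* 20% discount for ZEROMV */
        int biased = (int)(((int64_t)this_rd) * rd_adj / 100);
        printf("rd %d -> biased %d\n", this_rd, biased);
        return 0;
    }
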
diff --git a/source/libvpx/vp8/encoder/picklpf.c b/source/libvpx/vp8/encoder/picklpf.c
index 250d04c..f0c8f28 100644
--- a/source/libvpx/vp8/encoder/picklpf.c
+++ b/source/libvpx/vp8/encoder/picklpf.c
@@ -23,8 +23,8 @@
extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
-void vp8_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
- YV12_BUFFER_CONFIG *dst_ybc)
+static void yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc)
{
unsigned char *src_y, *dst_y;
int yheight;
@@ -173,7 +173,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
/* Get the err using the previous frame's filter value. */
/* Copy the unfiltered / processed recon buffer to the new buffer */
- vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+ yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
best_err = calc_partial_ssl_err(sd, cm->frame_to_show);
@@ -184,7 +184,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
while (filt_val >= min_filter_level)
{
/* Apply the loop filter */
- vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+ yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
/* Get the err for filtered frame */
@@ -214,7 +214,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
while (filt_val < max_filter_level)
{
/* Apply the loop filter */
- vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+ yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
diff --git a/source/libvpx/vp8/vp8_common.mk b/source/libvpx/vp8/vp8_common.mk
index 6db031f..9b11c0d 100644
--- a/source/libvpx/vp8/vp8_common.mk
+++ b/source/libvpx/vp8/vp8_common.mk
@@ -155,30 +155,25 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
-# common (neon)
-#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/reconintra_arm.c
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/loopfilter_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
-#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_blk_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
-
# common (neon intrinsics)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
diff --git a/source/libvpx/vp8/vp8_cx_iface.c b/source/libvpx/vp8/vp8_cx_iface.c
index 2f394ef..b1b079c 100644
--- a/source/libvpx/vp8/vp8_cx_iface.c
+++ b/source/libvpx/vp8/vp8_cx_iface.c
@@ -14,6 +14,7 @@
#include "vpx/vpx_codec.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx_version.h"
+#include "vpx_mem/vpx_mem.h"
#include "vp8/encoder/onyx_int.h"
#include "vpx/vp8cx.h"
#include "vp8/encoder/firstpass.h"
@@ -39,40 +40,28 @@ struct vp8_extracfg
};
-struct extraconfig_map
-{
- int usage;
- struct vp8_extracfg cfg;
-};
-
-static const struct extraconfig_map extracfg_map[] =
-{
- {
- 0,
- {
- NULL,
+static struct vp8_extracfg default_extracfg = {
+ NULL,
#if !(CONFIG_REALTIME_ONLY)
- 0, /* cpu_used */
+ 0, /* cpu_used */
#else
- 4, /* cpu_used */
+ 4, /* cpu_used */
#endif
- 0, /* enable_auto_alt_ref */
- 0, /* noise_sensitivity */
- 0, /* Sharpness */
- 0, /* static_thresh */
+ 0, /* enable_auto_alt_ref */
+ 0, /* noise_sensitivity */
+ 0, /* Sharpness */
+ 0, /* static_thresh */
#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
- VP8_EIGHT_TOKENPARTITION,
+ VP8_EIGHT_TOKENPARTITION,
#else
- VP8_ONE_TOKENPARTITION, /* token_partitions */
+ VP8_ONE_TOKENPARTITION, /* token_partitions */
#endif
- 0, /* arnr_max_frames */
- 3, /* arnr_strength */
- 3, /* arnr_type*/
- 0, /* tuning*/
- 10, /* cq_level */
- 0, /* rc_max_intra_bitrate_pct */
- }
- }
+ 0, /* arnr_max_frames */
+ 3, /* arnr_strength */
+ 3, /* arnr_type*/
+ 0, /* tuning*/
+ 10, /* cq_level */
+ 0, /* rc_max_intra_bitrate_pct */
};
struct vpx_codec_alg_priv
@@ -631,26 +620,21 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
vpx_codec_priv_enc_mr_cfg_t *mr_cfg)
{
vpx_codec_err_t res = VPX_CODEC_OK;
- struct vpx_codec_alg_priv *priv;
- vpx_codec_enc_cfg_t *cfg;
- unsigned int i;
- struct VP8_COMP *optr;
vp8_rtcd();
if (!ctx->priv)
{
- priv = calloc(1, sizeof(struct vpx_codec_alg_priv));
+ struct vpx_codec_alg_priv *priv =
+ (struct vpx_codec_alg_priv *)vpx_calloc(1, sizeof(*priv));
if (!priv)
{
return VPX_CODEC_MEM_ERROR;
}
- ctx->priv = &priv->base;
- ctx->priv->sz = sizeof(*ctx->priv);
- ctx->priv->alg_priv = priv;
+ ctx->priv = (vpx_codec_priv_t *)priv;
ctx->priv->init_flags = ctx->init_flags;
if (ctx->config.enc)
@@ -658,21 +642,11 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
/* Update the reference to the config structure to an
* internal copy.
*/
- ctx->priv->alg_priv->cfg = *ctx->config.enc;
- ctx->config.enc = &ctx->priv->alg_priv->cfg;
+ priv->cfg = *ctx->config.enc;
+ ctx->config.enc = &priv->cfg;
}
- cfg = &ctx->priv->alg_priv->cfg;
-
- /* Select the extra vp8 configuration table based on the current
- * usage value. If the current usage value isn't found, use the
- * values for usage case 0.
- */
- for (i = 0;
- extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
- i++);
-
- priv->vp8_cfg = extracfg_map[i].cfg;
+ priv->vp8_cfg = default_extracfg;
priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
@@ -695,17 +669,10 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
if (!res)
{
- set_vp8e_config(&ctx->priv->alg_priv->oxcf,
- ctx->priv->alg_priv->cfg,
- ctx->priv->alg_priv->vp8_cfg,
- mr_cfg);
-
- optr = vp8_create_compressor(&ctx->priv->alg_priv->oxcf);
-
- if (!optr)
+ set_vp8e_config(&priv->oxcf, priv->cfg, priv->vp8_cfg, mr_cfg);
+ priv->cpi = vp8_create_compressor(&priv->oxcf);
+ if (!priv->cpi)
res = VPX_CODEC_MEM_ERROR;
- else
- ctx->priv->alg_priv->cpi = optr;
}
}
@@ -726,7 +693,7 @@ static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx)
free(ctx->cx_data);
vp8_remove_compressor(&ctx->cpi);
- free(ctx);
+ vpx_free(ctx);
return VPX_CODEC_OK;
}
@@ -1278,6 +1245,9 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
320, /* g_width */
240, /* g_height */
+ VPX_BITS_8, /* g_bit_depth */
+ 8, /* g_input_bit_depth */
+
{1, 30}, /* g_timebase */
0, /* g_error_resilient */
@@ -1346,10 +1316,10 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) =
vp8e_destroy, /* vpx_codec_destroy_fn_t destroy; */
vp8e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
{
- NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */
- NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */
- NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */
- NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */
+ NULL, /* vpx_codec_peek_si_fn_t peek_si; */
+ NULL, /* vpx_codec_get_si_fn_t get_si; */
+ NULL, /* vpx_codec_decode_fn_t decode; */
+ NULL, /* vpx_codec_frame_get_fn_t frame_get; */
},
{
1, /* 1 cfg map */
@@ -1357,7 +1327,7 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) =
vp8e_encode, /* vpx_codec_encode_fn_t encode; */
vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */
vp8e_set_config,
- NOT_IMPLEMENTED,
+ NULL,
vp8e_get_preview,
vp8e_mr_alloc_mem,
} /* encoder functions */
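
The simplified init/destroy paths above rely on the public vpx_codec_priv_t being the first member of the algorithm-private struct, so a single vpx_calloc() serves both and the pointer casts are address-preserving. A self-contained sketch with stand-in types (error checks omitted):

    #include <stdlib.h>

    typedef struct { int init_flags; } codec_priv;   /* public base part */
    typedef struct {
        codec_priv base;   /* must remain the first member */
        int cfg;           /* algorithm-private state */
    } codec_alg_priv;

    int main(void) {
        codec_alg_priv *alg = calloc(1, sizeof(*alg));
        codec_priv *pub = (codec_priv *)alg;  /* same address as &alg->base */
        pub->init_flags = 1;                  /* writes alg->base.init_flags */
        free(alg);
        return 0;
    }
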
diff --git a/source/libvpx/vp8/vp8_dx_iface.c b/source/libvpx/vp8/vp8_dx_iface.c
index 0deda50..3ab8ed0 100644
--- a/source/libvpx/vp8/vp8_dx_iface.c
+++ b/source/libvpx/vp8/vp8_dx_iface.c
@@ -80,29 +80,30 @@ static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_
static void vp8_init_ctx(vpx_codec_ctx_t *ctx)
{
- ctx->priv =
- (vpx_codec_priv_t *)vpx_memalign(8, sizeof(vpx_codec_alg_priv_t));
- vpx_memset(ctx->priv, 0, sizeof(vpx_codec_alg_priv_t));
- ctx->priv->sz = sizeof(*ctx->priv);
- ctx->priv->alg_priv = (vpx_codec_alg_priv_t *)ctx->priv;
- ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
- ctx->priv->alg_priv->decrypt_cb = NULL;
- ctx->priv->alg_priv->decrypt_state = NULL;
- ctx->priv->alg_priv->flushed = 0;
+ vpx_codec_alg_priv_t *priv =
+ (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv));
+
+ ctx->priv = (vpx_codec_priv_t *)priv;
ctx->priv->init_flags = ctx->init_flags;
+ priv->si.sz = sizeof(priv->si);
+ priv->decrypt_cb = NULL;
+ priv->decrypt_state = NULL;
+ priv->flushed = 0;
+
if (ctx->config.dec)
{
/* Update the reference to the config structure to an internal copy. */
- ctx->priv->alg_priv->cfg = *ctx->config.dec;
- ctx->config.dec = &ctx->priv->alg_priv->cfg;
+ priv->cfg = *ctx->config.dec;
+ ctx->config.dec = &priv->cfg;
}
}
static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
vpx_codec_priv_enc_mr_cfg_t *data)
{
- vpx_codec_err_t res = VPX_CODEC_OK;
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ vpx_codec_alg_priv_t *priv = NULL;
(void) data;
vp8_rtcd();
@@ -114,29 +115,30 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
if (!ctx->priv)
{
vp8_init_ctx(ctx);
+ priv = (vpx_codec_alg_priv_t *)ctx->priv;
/* initialize number of fragments to zero */
- ctx->priv->alg_priv->fragments.count = 0;
+ priv->fragments.count = 0;
/* is input fragments enabled? */
- ctx->priv->alg_priv->fragments.enabled =
- (ctx->priv->alg_priv->base.init_flags &
- VPX_CODEC_USE_INPUT_FRAGMENTS);
+ priv->fragments.enabled =
+ (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS);
/*post processing level initialized to do nothing */
}
+ else
+ {
+ priv = (vpx_codec_alg_priv_t *)ctx->priv;
+ }
- ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads =
- (ctx->priv->alg_priv->base.init_flags &
- VPX_CODEC_USE_FRAME_THREADING);
+ priv->yv12_frame_buffers.use_frame_threads =
+ (ctx->priv->init_flags & VPX_CODEC_USE_FRAME_THREADING);
/* for now, disable frame threading */
- ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads = 0;
+ priv->yv12_frame_buffers.use_frame_threads = 0;
- if(ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads &&
- (( ctx->priv->alg_priv->base.init_flags &
- VPX_CODEC_USE_ERROR_CONCEALMENT)
- || ( ctx->priv->alg_priv->base.init_flags &
- VPX_CODEC_USE_INPUT_FRAGMENTS) ) )
+ if (priv->yv12_frame_buffers.use_frame_threads &&
+ ((ctx->priv->init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT) ||
+ (ctx->priv->init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS)))
{
/* row-based threading, error concealment, and input fragments will
* not be supported when using frame-based threading */
@@ -814,15 +816,15 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) =
vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */
vp8_decode, /* vpx_codec_decode_fn_t decode; */
vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
- NOT_IMPLEMENTED,
+ NULL,
},
{ /* encoder functions */
0,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL
}
};

diff --git a/source/libvpx/vp8/vp8cx_arm.mk b/source/libvpx/vp8/vp8cx_arm.mk
index 0b3eed0..551271e 100644
--- a/source/libvpx/vp8/vp8cx_arm.mk
+++ b/source/libvpx/vp8/vp8cx_arm.mk
@@ -36,11 +36,9 @@ VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
#File list for neon
# encoder
VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/fastquantizeb_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/picklpf_arm.c
-VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/shortfdct_neon$(ASM)
VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_memcpy_neon$(ASM)
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
diff --git a/source/libvpx/vp9/common/vp9_alloccommon.c b/source/libvpx/vp9/common/vp9_alloccommon.c
index c65e008..21ae8d5 100644
--- a/source/libvpx/vp9/common/vp9_alloccommon.c
+++ b/source/libvpx/vp9/common/vp9_alloccommon.c
@@ -177,7 +177,11 @@ int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) {
for (i = 0; i < FRAME_BUFFERS; ++i) {
cm->frame_bufs[i].ref_count = 0;
if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height,
- ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0)
+ ss_x, ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS) < 0)
goto fail;
}
@@ -185,6 +189,9 @@ int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) {
#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS) < 0)
goto fail;
#endif
diff --git a/source/libvpx/vp9/common/vp9_common.h b/source/libvpx/vp9/common/vp9_common.h
index 2788e66..5587192 100644
--- a/source/libvpx/vp9/common/vp9_common.h
+++ b/source/libvpx/vp9/common/vp9_common.h
@@ -64,6 +64,11 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {
return num_values > 0 ? get_msb(num_values) + 1 : 0;
}
+#if CONFIG_VP9_HIGHBITDEPTH
+#define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1))
+#define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1))
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#if CONFIG_DEBUG
#define CHECK_MEM_ERROR(cm, lval, expr) do { \
lval = (expr); \
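
A sketch of why the shifted-pointer macros above round-trip: high-bitdepth planes are uint16_t buffers threaded through uint8_t * interfaces, and storing the real address divided by two makes byte arithmetic on the stored pointer equal element arithmetic on the real buffer. This assumes 2-byte-aligned buffers (so the right shift loses nothing); the halved pointer itself must never be dereferenced. Macro names below mirror the hunk but are local stand-ins:

    #include <assert.h>
    #include <stdint.h>

    #define TO_BYTEPTR(x)  ((uint8_t *)(((uintptr_t)(x)) >> 1))
    #define TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))

    int main(void) {
        static uint16_t buf[8];
        uint8_t *b = TO_BYTEPTR(buf);          /* not dereferenceable */
        assert(TO_SHORTPTR(b + 3) == buf + 3); /* +3 bytes == +3 elements */
        return 0;
    }
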
diff --git a/source/libvpx/vp9/common/vp9_enums.h b/source/libvpx/vp9/common/vp9_enums.h
index d776313..8817fdb 100644
--- a/source/libvpx/vp9/common/vp9_enums.h
+++ b/source/libvpx/vp9/common/vp9_enums.h
@@ -40,12 +40,6 @@ typedef enum BITSTREAM_PROFILE {
MAX_PROFILES
} BITSTREAM_PROFILE;
-typedef enum BIT_DEPTH {
- BITS_8,
- BITS_10,
- BITS_12
-} BIT_DEPTH;
-
typedef enum BLOCK_SIZE {
BLOCK_4X4,
BLOCK_4X8,
diff --git a/source/libvpx/vp9/common/vp9_mv.h b/source/libvpx/vp9/common/vp9_mv.h
index 3eb7f9d..5d89da8 100644
--- a/source/libvpx/vp9/common/vp9_mv.h
+++ b/source/libvpx/vp9/common/vp9_mv.h
@@ -34,6 +34,14 @@ typedef struct mv32 {
int32_t col;
} MV32;
+static INLINE int is_zero_mv(const MV *mv) {
+ return *((const uint32_t *)mv) == 0;
+}
+
+static INLINE int is_equal_mv(const MV *a, const MV *b) {
+ return *((const uint32_t *)a) == *((const uint32_t *)b);
+}
+
static INLINE void clamp_mv(MV *mv, int min_col, int max_col,
int min_row, int max_row) {
mv->col = clamp(mv->col, min_col, max_col);
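
A sketch of the layout assumption behind is_zero_mv()/is_equal_mv() above: the two int16_t members pack into exactly 32 bits with no padding, so a single 32-bit compare tests both components at once (accepting the strict-aliasing caveat this cast carries). The MV stand-in below mirrors the real struct:

    #include <assert.h>
    #include <stdint.h>

    typedef struct mv { int16_t row, col; } MV;

    static int is_zero_mv_sketch(const MV *mv) {
        return *((const uint32_t *)mv) == 0;  /* strict-aliasing caveat */
    }

    int main(void) {
        MV a = {0, 0}, b = {1, 0};
        assert(sizeof(MV) == sizeof(uint32_t));  /* no padding expected */
        assert(is_zero_mv_sketch(&a) && !is_zero_mv_sketch(&b));
        return 0;
    }
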
diff --git a/source/libvpx/vp9/common/vp9_onyxc_int.h b/source/libvpx/vp9/common/vp9_onyxc_int.h
index 47aa563..637867a 100644
--- a/source/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/source/libvpx/vp9/common/vp9_onyxc_int.h
@@ -84,6 +84,10 @@ typedef struct VP9Common {
int subsampling_x;
int subsampling_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth; // Marks if we need to use 16bit frame buffers.
+#endif
+
YV12_BUFFER_CONFIG *frame_to_show;
RefCntBuffer frame_bufs[FRAME_BUFFERS];
@@ -179,8 +183,8 @@ typedef struct VP9Common {
unsigned int current_video_frame;
BITSTREAM_PROFILE profile;
- // BITS_8 in versions 0 and 1, BITS_10 or BITS_12 in version 2
- BIT_DEPTH bit_depth;
+ // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3.
+ vpx_bit_depth_t bit_depth;
#if CONFIG_VP9_POSTPROC
struct postproc_state postproc_state;
diff --git a/source/libvpx/vp9/common/vp9_postproc.c b/source/libvpx/vp9/common/vp9_postproc.c
index abda4e6..e4e6ce7 100644
--- a/source/libvpx/vp9/common/vp9_postproc.c
+++ b/source/libvpx/vp9/common/vp9_postproc.c
@@ -366,6 +366,9 @@ void vp9_plane_add_noise_c(uint8_t *start, char *noise,
unsigned int width, unsigned int height, int pitch) {
unsigned int i, j;
+  // TODO(jbb): the SIMD versions use bothclamp but the C version does not;
+  // normalize the two and fix.
+ (void) bothclamp;
for (i = 0; i < height; i++) {
uint8_t *pos = start + i * pitch;
char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT
diff --git a/source/libvpx/vp9/common/vp9_reconintra.c b/source/libvpx/vp9/common/vp9_reconintra.c
index 403e105..471929a 100644
--- a/source/libvpx/vp9/common/vp9_reconintra.c
+++ b/source/libvpx/vp9/common/vp9_reconintra.c
@@ -9,11 +9,9 @@
*/
#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/vpx_once.h"
-
-#include "./vp9_rtcd.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_onyxc_int.h"
@@ -292,32 +290,32 @@ intra_pred_allsizes(dc)
typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left);
-static intra_pred_fn pred[INTRA_MODES][4];
-static intra_pred_fn dc_pred[2][2][4];
-
-static void init_intra_pred_fn_ptrs(void) {
-#define intra_pred_allsizes(l, type) \
- l[0] = vp9_##type##_predictor_4x4; \
- l[1] = vp9_##type##_predictor_8x8; \
- l[2] = vp9_##type##_predictor_16x16; \
- l[3] = vp9_##type##_predictor_32x32
-
- intra_pred_allsizes(pred[V_PRED], v);
- intra_pred_allsizes(pred[H_PRED], h);
- intra_pred_allsizes(pred[D207_PRED], d207);
- intra_pred_allsizes(pred[D45_PRED], d45);
- intra_pred_allsizes(pred[D63_PRED], d63);
- intra_pred_allsizes(pred[D117_PRED], d117);
- intra_pred_allsizes(pred[D135_PRED], d135);
- intra_pred_allsizes(pred[D153_PRED], d153);
- intra_pred_allsizes(pred[TM_PRED], tm);
-
- intra_pred_allsizes(dc_pred[0][0], dc_128);
- intra_pred_allsizes(dc_pred[0][1], dc_top);
- intra_pred_allsizes(dc_pred[1][0], dc_left);
- intra_pred_allsizes(dc_pred[1][1], dc);
-
-#undef intra_pred_allsizes
+static intra_pred_fn pred[INTRA_MODES][TX_SIZES];
+static intra_pred_fn dc_pred[2][2][TX_SIZES];
+
+void vp9_init_intra_predictors(void) {
+#define INIT_ALL_SIZES(p, type) \
+ p[TX_4X4] = vp9_##type##_predictor_4x4; \
+ p[TX_8X8] = vp9_##type##_predictor_8x8; \
+ p[TX_16X16] = vp9_##type##_predictor_16x16; \
+ p[TX_32X32] = vp9_##type##_predictor_32x32
+
+ INIT_ALL_SIZES(pred[V_PRED], v);
+ INIT_ALL_SIZES(pred[H_PRED], h);
+ INIT_ALL_SIZES(pred[D207_PRED], d207);
+ INIT_ALL_SIZES(pred[D45_PRED], d45);
+ INIT_ALL_SIZES(pred[D63_PRED], d63);
+ INIT_ALL_SIZES(pred[D117_PRED], d117);
+ INIT_ALL_SIZES(pred[D135_PRED], d135);
+ INIT_ALL_SIZES(pred[D153_PRED], d153);
+ INIT_ALL_SIZES(pred[TM_PRED], tm);
+
+ INIT_ALL_SIZES(dc_pred[0][0], dc_128);
+ INIT_ALL_SIZES(dc_pred[0][1], dc_top);
+ INIT_ALL_SIZES(dc_pred[1][0], dc_left);
+ INIT_ALL_SIZES(dc_pred[1][1], dc);
+
+#undef INIT_ALL_SIZES
}
static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
@@ -343,8 +341,6 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
// 129 G H .. S T T T T T
// ..
- once(init_intra_pred_fn_ptrs);
-
// Get current frame pointer, width and height.
if (plane == 0) {
frame_width = xd->cur_buf->y_width;
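
The change above replaces lazy once()-guarded setup of the prediction tables with a single explicit call, vp9_init_intra_predictors(), invoked from the decoder's initialize_dec() later in this patch. A minimal sketch of the pattern with illustrative names:

    /* Fill a static function-pointer table once, up front, instead of
     * guarding every lookup with once(). */
    typedef void (*pred_fn)(unsigned char *dst);

    static void pred_4x4(unsigned char *dst) { dst[0] = 4; }
    static void pred_8x8(unsigned char *dst) { dst[0] = 8; }

    static pred_fn pred_table[2];

    void init_predictors(void) {  /* call once at codec creation */
        pred_table[0] = pred_4x4;
        pred_table[1] = pred_8x8;
    }
    /* usage: init_predictors(); then pred_table[size](dst); */
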
diff --git a/source/libvpx/vp9/common/vp9_reconintra.h b/source/libvpx/vp9/common/vp9_reconintra.h
index d09d2a1..845f3bc 100644
--- a/source/libvpx/vp9/common/vp9_reconintra.h
+++ b/source/libvpx/vp9/common/vp9_reconintra.h
@@ -18,6 +18,8 @@
extern "C" {
#endif
+void vp9_init_intra_predictors(void);
+
void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
TX_SIZE tx_size, PREDICTION_MODE mode,
const uint8_t *ref, int ref_stride,
diff --git a/source/libvpx/vp9/common/vp9_rtcd_defs.pl b/source/libvpx/vp9/common/vp9_rtcd_defs.pl
index 92f9318..667e057 100644
--- a/source/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/source/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -268,7 +268,7 @@ $vp9_lpf_horizontal_4_dual_neon_asm=vp9_lpf_horizontal_4_dual_neon;
#
if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
add_proto qw/void vp9_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit";
-specialize qw/vp9_mbpost_proc_down mmx sse2/;
+specialize qw/vp9_mbpost_proc_down sse2/;
$vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm;
add_proto qw/void vp9_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit";
@@ -276,23 +276,14 @@ specialize qw/vp9_mbpost_proc_across_ip sse2/;
$vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm;
add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
-specialize qw/vp9_post_proc_down_and_across mmx sse2/;
+specialize qw/vp9_post_proc_down_and_across sse2/;
$vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-specialize qw/vp9_plane_add_noise mmx sse2/;
+specialize qw/vp9_plane_add_noise sse2/;
$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
}
-add_proto qw/void vp9_blend_mb_inner/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
-specialize qw/vp9_blend_mb_inner/;
-
-add_proto qw/void vp9_blend_mb_outer/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
-specialize qw/vp9_blend_mb_outer/;
-
-add_proto qw/void vp9_blend_b/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
-specialize qw/vp9_blend_b/;
-
#
# Sub Pixel Filters
#
@@ -420,19 +411,19 @@ add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int sourc
specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc";
+specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
+specialize qw/vp9_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc";
+specialize qw/vp9_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x8 mmx neon/, "$sse2_x86inc";
+specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc";
add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get8x8var mmx neon/, "$sse2_x86inc";
+specialize qw/vp9_get8x8var neon/, "$sse2_x86inc";
add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
@@ -444,7 +435,7 @@ add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_
specialize qw/vp9_variance4x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc";
+specialize qw/vp9_variance4x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -545,16 +536,16 @@ add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_str
specialize qw/vp9_sad32x32 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad16x16 mmx neon/, "$sse2_x86inc";
+specialize qw/vp9_sad16x16 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad16x8 mmx/, "$sse2_x86inc";
+specialize qw/vp9_sad16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad8x16 mmx/, "$sse2_x86inc";
+specialize qw/vp9_sad8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad8x8 mmx neon/, "$sse2_x86inc";
+specialize qw/vp9_sad8x8 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_sad8x4/, "$sse2_x86inc";
@@ -563,7 +554,7 @@ add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_strid
specialize qw/vp9_sad4x8/, "$sse_x86inc";
add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad4x4 mmx/, "$sse_x86inc";
+specialize qw/vp9_sad4x4/, "$sse_x86inc";
add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_sad64x64_avg/, "$sse2_x86inc";
@@ -693,19 +684,19 @@ add_proto qw/void vp9_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, cons
specialize qw/vp9_sad4x4x4d sse/;
add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x16 mmx avx2/, "$sse2_x86inc";
+specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x16/;
+specialize qw/vp9_mse8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x8/;
+specialize qw/vp9_mse16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x8/;
+specialize qw/vp9_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
-specialize qw/vp9_get_mb_ss mmx sse2/;
+specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
# ENCODEMB INVOKE
add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz";
diff --git a/source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm b/source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm
deleted file mode 100644
index 5b8deef..0000000
--- a/source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm
+++ /dev/null
@@ -1,533 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT 7
-
-;void vp9_post_proc_down_and_across_mmx
-;(
-; unsigned char *src_ptr,
-; unsigned char *dst_ptr,
-; int src_pixels_per_line,
-; int dst_pixels_per_line,
-; int rows,
-; int cols,
-; int flimit
-;)
-global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
-sym(vp9_post_proc_down_and_across_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- ; move the global rd onto the stack, since we don't have enough registers
- ; to do PIC addressing
- movq mm0, [GLOBAL(rd)]
- sub rsp, 8
- movq [rsp], mm0
-%define RD [rsp]
-%else
-%define RD [GLOBAL(rd)]
-%endif
-
- push rbx
- lea rbx, [GLOBAL(Blur)]
- movd mm2, dword ptr arg(6) ;flimit
- punpcklwd mm2, mm2
- punpckldq mm2, mm2
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;dst_ptr
-
- movsxd rcx, DWORD PTR arg(4) ;rows
- movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
- pxor mm0, mm0 ; mm0 = 00000000
-
-.nextrow:
-
- xor rdx, rdx ; clear out rdx for use as loop counter
-.nextcol:
-
- pxor mm7, mm7 ; mm7 = 00000000
- movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
- movq mm3, [rsi] ; mm4 = r0 p0..p7
- punpcklbw mm3, mm0 ; mm3 = p0..p3
- movq mm1, mm3 ; mm1 = p0..p3
- pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
-
- movq mm6, [rbx + 48] ; mm6 = kernel 3 taps
- movq mm5, [rsi + rax] ; mm4 = r1 p0..p7
- punpcklbw mm5, mm0 ; mm5 = r1 p0..p3
- pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers
- paddusw mm3, mm6 ; mm3 += mm6
-
- ; thresholding
- movq mm7, mm1 ; mm7 = r0 p0..p3
- psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3
- psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3
- paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
- pcmpgtw mm7, mm2
-
- movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers
- movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7
- punpcklbw mm5, mm0 ; mm5 = r2 p0..p3
- pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = r0 p0..p3
- psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3
- psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r2 p0..p3
- paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
-
- neg rax
- movq mm6, [rbx ] ; kernel 0 taps
- movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7
- punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3
- pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = r0 p0..p3
- psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3
- psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3
- paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
- movq mm6, [rbx + 16] ; kernel 1 taps
- movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7
- punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3
- pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = r0 p0..p3
- psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3
- psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3
- paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
-
- paddusw mm3, RD ; mm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
-
- pand mm1, mm7 ; mm1 select vals > thresh from source
- pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
- paddusw mm1, mm7 ; combination
-
- packuswb mm1, mm0 ; pack to bytes
-
- movd [rdi], mm1 ;
- neg rax ; pitch is positive
-
-
- add rsi, 4
- add rdi, 4
- add rdx, 4
-
- cmp edx, dword ptr arg(5) ;cols
- jl .nextcol
- ; done with the all cols, start the across filtering in place
- sub rsi, rdx
- sub rdi, rdx
-
-
- push rax
- xor rdx, rdx
- mov rax, [rdi-4];
-
-.acrossnextcol:
- pxor mm7, mm7 ; mm7 = 00000000
- movq mm6, [rbx + 32 ] ;
- movq mm4, [rdi+rdx] ; mm4 = p0..p7
- movq mm3, mm4 ; mm3 = p0..p7
- punpcklbw mm3, mm0 ; mm3 = p0..p3
- movq mm1, mm3 ; mm1 = p0..p3
- pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
-
- movq mm6, [rbx + 48]
- psrlq mm4, 8 ; mm4 = p1..p7
- movq mm5, mm4 ; mm5 = p1..p7
- punpcklbw mm5, mm0 ; mm5 = p1..p4
- pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers
- paddusw mm3, mm6 ; mm3 += mm6
-
- ; thresholding
- movq mm7, mm1 ; mm7 = p0..p3
- psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4
- psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
- paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4)
- pcmpgtw mm7, mm2
-
- movq mm6, [rbx + 64 ]
- psrlq mm4, 8 ; mm4 = p2..p7
- movq mm5, mm4 ; mm5 = p2..p7
- punpcklbw mm5, mm0 ; mm5 = p2..p5
- pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = p0..p3
- psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4
- psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
- paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
-
- movq mm6, [rbx ]
- movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5
- movq mm5, mm4 ; mm5 = p-2..p5
- punpcklbw mm5, mm0 ; mm5 = p-2..p1
- pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = p0..p3
- psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4
- psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
- paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
- movq mm6, [rbx + 16]
- psrlq mm4, 8 ; mm4 = p-1..p5
- punpcklbw mm4, mm0 ; mm4 = p-1..p2
- pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = p0..p3
- psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4
- psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3
- paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
- paddusw mm3, RD ; mm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
-
- pand mm1, mm7 ; mm1 select vals > thresh from source
- pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
- paddusw mm1, mm7 ; combination
-
- packuswb mm1, mm0 ; pack to bytes
- mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes
- movd eax, mm1
-
- add rdx, 4
- cmp edx, dword ptr arg(5) ;cols
- jl .acrossnextcol;
-
- mov DWORD PTR [rdi+rdx-4], eax
- pop rax
-
-    ; done with this row
- add rsi,rax ; next line
- movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch?
- add rdi,rax ; next destination
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?
-
- dec rcx ; decrement count
- jnz .nextrow ; next row
- pop rbx
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef RD
-
-
-;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
-; int pitch, int rows, int cols,int flimit)
-extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_mmx) PRIVATE
-sym(vp9_mbpost_proc_down_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 136
-
- ; unsigned char d[16][8] at [rsp]
- ; create flimit2 at [rsp+128]
- mov eax, dword ptr arg(4) ;flimit
- mov [rsp+128], eax
- mov [rsp+128+4], eax
-%define flimit2 [rsp+128]
-
-%if ABI_IS_32BIT=0
- lea r8, [GLOBAL(sym(vp9_rv))]
-%endif
-
- ;rows +=8;
- add dword ptr arg(2), 8
-
- ;for(c=0; c<cols; c+=4)
-.loop_col:
- mov rsi, arg(0) ;s
- pxor mm0, mm0 ;
-
- movsxd rax, dword ptr arg(1) ;pitch ;
- neg rax ; rax = -pitch
-
- lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]
- neg rax
-
-
- pxor mm5, mm5
- pxor mm6, mm6 ;
-
- pxor mm7, mm7 ;
- mov rdi, rsi
-
- mov rcx, 15 ;
-
-.loop_initvar:
- movd mm1, DWORD PTR [rdi];
- punpcklbw mm1, mm0 ;
-
- paddw mm5, mm1 ;
- pmullw mm1, mm1 ;
-
- movq mm2, mm1 ;
- punpcklwd mm1, mm0 ;
-
- punpckhwd mm2, mm0 ;
- paddd mm6, mm1 ;
-
- paddd mm7, mm2 ;
- lea rdi, [rdi+rax] ;
-
- dec rcx
- jne .loop_initvar
- ;save the var and sum
- xor rdx, rdx
-.loop_row:
- movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
- movd mm2, DWORD PTR [rdi] ; [s+pitch*7]
-
- punpcklbw mm1, mm0
- punpcklbw mm2, mm0
-
- paddw mm5, mm2
- psubw mm5, mm1
-
- pmullw mm2, mm2
- movq mm4, mm2
-
- punpcklwd mm2, mm0
- punpckhwd mm4, mm0
-
- paddd mm6, mm2
- paddd mm7, mm4
-
- pmullw mm1, mm1
- movq mm2, mm1
-
- punpcklwd mm1, mm0
- psubd mm6, mm1
-
- punpckhwd mm2, mm0
- psubd mm7, mm2
-
-
- movq mm3, mm6
- pslld mm3, 4
-
- psubd mm3, mm6
- movq mm1, mm5
-
- movq mm4, mm5
- pmullw mm1, mm1
-
- pmulhw mm4, mm4
- movq mm2, mm1
-
- punpcklwd mm1, mm4
- punpckhwd mm2, mm4
-
- movq mm4, mm7
- pslld mm4, 4
-
- psubd mm4, mm7
-
- psubd mm3, mm1
- psubd mm4, mm2
-
- psubd mm3, flimit2
- psubd mm4, flimit2
-
- psrad mm3, 31
- psrad mm4, 31
-
- packssdw mm3, mm4
- packsswb mm3, mm0
-
- movd mm1, DWORD PTR [rsi+rax*8]
-
- movq mm2, mm1
- punpcklbw mm1, mm0
-
- paddw mm1, mm5
- mov rcx, rdx
-
- and rcx, 127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- push rax
- lea rax, [GLOBAL(sym(vp9_rv))]
- movq mm4, [rax + rcx*2] ;vp9_rv[rcx*2]
- pop rax
-%elif ABI_IS_32BIT=0
- movq mm4, [r8 + rcx*2] ;vp9_rv[rcx*2]
-%else
- movq mm4, [sym(vp9_rv) + rcx*2]
-%endif
- paddw mm1, mm4
- ;paddw xmm1, eight8s
- psraw mm1, 4
-
- packuswb mm1, mm0
- pand mm1, mm3
-
- pandn mm3, mm2
- por mm1, mm3
-
- and rcx, 15
- movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]
-
- mov rcx, rdx
- sub rcx, 8
-
- and rcx, 15
- movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
-
- movd [rsi], mm1
- lea rsi, [rsi+rax]
-
- lea rdi, [rdi+rax]
- add rdx, 1
-
- cmp edx, dword arg(2) ;rows
- jl .loop_row
-
-
- add dword arg(0), 4 ; s += 4
- sub dword arg(3), 4 ; cols -= 4
- cmp dword arg(3), 0
- jg .loop_col
-
- add rsp, 136
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef flimit2
-
-
-;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,
-; unsigned char blackclamp[16],
-; unsigned char whiteclamp[16],
-; unsigned char bothclamp[16],
-; unsigned int width, unsigned int height, int pitch)
-global sym(vp9_plane_add_noise_mmx) PRIVATE
-sym(vp9_plane_add_noise_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-.addnoise_loop:
- call sym(LIBVPX_RAND) WRT_PLT
- mov rcx, arg(1) ;noise
- and rax, 0xff
- add rcx, rax
-
- ; we rely on the fact that the clamping vectors are stored contiguously
- ; in black/white/both order. Note that we have to reload this here because
- ; rdx could be trashed by rand()
- mov rdx, arg(2) ; blackclamp
-
-
- mov rdi, rcx
- movsxd rcx, dword arg(5) ;[Width]
- mov rsi, arg(0) ;Pos
- xor rax,rax
-
-.addnoise_nextset:
- movq mm1,[rsi+rax] ; get the source
-
- psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
- paddusb mm1, [rdx+32] ;bothclamp
- psubusb mm1, [rdx+16] ;whiteclamp
-
- movq mm2,[rdi+rax] ; get the noise for this line
- paddb mm1,mm2 ; add it in
- movq [rsi+rax],mm1 ; store the result
-
- add rax,8 ; move to the next line
-
- cmp rax, rcx
- jl .addnoise_nextset
-
- movsxd rax, dword arg(7) ; Pitch
- add arg(0), rax ; Start += Pitch
- sub dword arg(6), 1 ; Height -= 1
- jg .addnoise_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-align 16
-Blur:
- times 16 dw 16
- times 8 dw 64
- times 16 dw 16
- times 8 dw 0
-
-rd:
- times 4 dw 0x40
diff --git a/source/libvpx/vp9/decoder/vp9_decodeframe.c b/source/libvpx/vp9/decoder/vp9_decodeframe.c
index a0fff45..a9c03f0 100644
--- a/source/libvpx/vp9/decoder/vp9_decodeframe.c
+++ b/source/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -330,6 +330,9 @@ static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
if (!vp9_is_valid_scale(&ref_buffer->sf))
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid scale factors");
+ if (ref_buffer->buf->corrupted)
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Block reference is corrupt");
vp9_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
&ref_buffer->sf);
xd->corrupted |= ref_buffer->buf->corrupted;
@@ -627,11 +630,14 @@ static void resize_context_buffers(VP9_COMMON *cm, int width, int height) {
"Width and height beyond allowed size.");
#endif
if (cm->width != width || cm->height != height) {
- const int new_rows = ALIGN_POWER_OF_TWO(height,
- MI_SIZE_LOG2) >> MI_SIZE_LOG2;
- const int new_cols = ALIGN_POWER_OF_TWO(width,
- MI_SIZE_LOG2) >> MI_SIZE_LOG2;
- if (calc_mi_size(new_rows) * calc_mi_size(new_cols) > cm->mi_alloc_size) {
+ const int new_mi_rows =
+ ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+ const int new_mi_cols =
+ ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+
+ // Allocations in vp9_alloc_context_buffers() depend on individual
+ // dimensions as well as the overall size.
+ if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) {
if (vp9_alloc_context_buffers(cm, width, height))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate context buffers");
@@ -652,7 +658,11 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
if (vp9_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_DEC_BORDER_IN_PIXELS,
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
cm->cb_priv)) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
@@ -670,6 +680,10 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
width = buf->y_crop_width;
height = buf->y_crop_height;
+ if (buf->corrupted) {
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Frame reference is corrupt");
+ }
found = 1;
break;
}
@@ -699,7 +713,11 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
if (vp9_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_DEC_BORDER_IN_PIXELS,
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
cm->cb_priv)) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
@@ -812,6 +830,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
if (cm->lf.filter_level) {
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+ // Be sure to sync as we might be resuming after a failed frame decode.
+ winterface->sync(&pbi->lf_worker);
lf_data->frame_buffer = get_frame_new_buffer(cm);
lf_data->cm = cm;
vp9_copy(lf_data->planes, pbi->mb.plane);
@@ -881,7 +901,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
pbi->mb.corrupted |= tile_data->xd.corrupted;
}
// Loopfilter one row.
- if (cm->lf.filter_level) {
+ if (cm->lf.filter_level && !pbi->mb.corrupted) {
const int lf_start = mi_row - MI_BLOCK_SIZE;
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
@@ -904,7 +924,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
}
// Loopfilter remaining rows in the frame.
- if (cm->lf.filter_level) {
+ if (cm->lf.filter_level && !pbi->mb.corrupted) {
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
winterface->sync(&pbi->lf_worker);
lf_data->start = lf_data->stop;
@@ -993,6 +1013,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
// Reset tile decoding hook
for (n = 0; n < num_workers; ++n) {
+ winterface->sync(&pbi->tile_workers[n]);
pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook;
}
@@ -1096,7 +1117,7 @@ BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb) {
static void read_bitdepth_colorspace_sampling(
VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
if (cm->profile >= PROFILE_2)
- cm->bit_depth = vp9_rb_read_bit(rb) ? BITS_12 : BITS_10;
+ cm->bit_depth = vp9_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10;
cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3);
if (cm->color_space != SRGB) {
vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range
@@ -1140,6 +1161,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
"Invalid frame marker");
cm->profile = vp9_read_profile(rb);
+
if (cm->profile >= MAX_PROFILES)
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Unsupported bitstream profile");
@@ -1398,7 +1420,7 @@ void vp9_decode_frame(VP9Decoder *pbi,
if (!first_partition_size) {
// showing a frame directly
- *p_data_end = data + 1;
+ *p_data_end = data + (cm->profile <= PROFILE_2 ? 1 : 2);
return;
}
@@ -1429,9 +1451,11 @@ void vp9_decode_frame(VP9Decoder *pbi,
if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
cm->frame_parallel_decoding_mode) {
*p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
- // If multiple threads are used to decode tiles, then we use those threads
- // to do parallel loopfiltering.
- vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
+ if (!xd->corrupted) {
+ // If multiple threads are used to decode tiles, then we use those threads
+ // to do parallel loopfiltering.
+ vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
+ }
} else {
*p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
}
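
The rewritten resize check above guards growth in each dimension rather than total area: vp9_alloc_context_buffers() sizes some buffers per row (and per column), so a frame with the same mi count but more rows can overflow them. A worked example with illustrative numbers (MI units are 8x8 pixels):

    #include <stdio.h>

    int main(void) {
        int old_cols = 1280 / 8, old_rows = 720 / 8;   /* 160 x 90  */
        int new_cols = 640 / 8,  new_rows = 1440 / 8;  /*  80 x 180 */
        printf("total mi: %d -> %d (unchanged)\n",
               old_cols * old_rows, new_cols * new_rows);
        printf("mi rows:  %d -> %d (grows, so realloc is required)\n",
               old_rows, new_rows);
        return 0;
    }
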
diff --git a/source/libvpx/vp9/decoder/vp9_decoder.c b/source/libvpx/vp9/decoder/vp9_decoder.c
index e79dcf3..9106b0d 100644
--- a/source/libvpx/vp9/decoder/vp9_decoder.c
+++ b/source/libvpx/vp9/decoder/vp9_decoder.c
@@ -25,6 +25,7 @@
#include "vp9/common/vp9_postproc.h"
#endif
#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/decoder/vp9_decodeframe.h"
@@ -36,7 +37,9 @@ static void initialize_dec() {
static int init_done = 0;
if (!init_done) {
+ vp9_rtcd();
vp9_init_neighbors();
+ vp9_init_intra_predictors();
init_done = 1;
}
}
@@ -59,13 +62,12 @@ VP9Decoder *vp9_decoder_create() {
cm->error.setjmp = 1;
initialize_dec();
- vp9_rtcd();
-
// Initialize the references to not point to any frame buffers.
vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
cm->current_video_frame = 0;
pbi->ready_for_new_data = 1;
+ cm->bit_depth = VPX_BITS_8;
// vp9_init_dequantizer() is first called here. Add check in
// frame_init_dequantizer() to avoid unnecessary calling of
@@ -96,10 +98,8 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
}
vpx_free(pbi->tile_workers);
- if (pbi->num_tile_workers) {
- const int sb_rows =
- mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
- vp9_loop_filter_dealloc(&pbi->lf_row_sync, sb_rows);
+ if (pbi->num_tile_workers > 0) {
+ vp9_loop_filter_dealloc(&pbi->lf_row_sync);
}
vp9_remove_common(cm);
diff --git a/source/libvpx/vp9/decoder/vp9_dthread.c b/source/libvpx/vp9/decoder/vp9_dthread.c
index 5dda49a..b82ea6a 100644
--- a/source/libvpx/vp9/decoder/vp9_dthread.c
+++ b/source/libvpx/vp9/decoder/vp9_dthread.c
@@ -147,17 +147,8 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
// Allocate memory used in thread synchronization.
// This always needs to be done even if frame_filter_level is 0.
- if (!cm->current_video_frame || cm->last_height != cm->height) {
- if (cm->last_height != cm->height) {
- const int aligned_last_height =
- ALIGN_POWER_OF_TWO(cm->last_height, MI_SIZE_LOG2);
- const int last_sb_rows =
- mi_cols_aligned_to_sb(aligned_last_height >> MI_SIZE_LOG2) >>
- MI_BLOCK_SIZE_LOG2;
-
- vp9_loop_filter_dealloc(lf_sync, last_sb_rows);
- }
-
+ if (!lf_sync->sync_range || cm->last_height != cm->height) {
+ vp9_loop_filter_dealloc(lf_sync);
vp9_loop_filter_alloc(cm, lf_sync, sb_rows, cm->width);
}
@@ -227,19 +218,22 @@ static int get_sync_range(int width) {
// Allocate memory for lf row synchronization
void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows,
int width) {
+ lf_sync->rows = rows;
#if CONFIG_MULTITHREAD
- int i;
+ {
+ int i;
- CHECK_MEM_ERROR(cm, lf_sync->mutex_,
- vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
- for (i = 0; i < rows; ++i) {
- pthread_mutex_init(&lf_sync->mutex_[i], NULL);
- }
+ CHECK_MEM_ERROR(cm, lf_sync->mutex_,
+ vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&lf_sync->mutex_[i], NULL);
+ }
- CHECK_MEM_ERROR(cm, lf_sync->cond_,
- vpx_malloc(sizeof(*lf_sync->cond_) * rows));
- for (i = 0; i < rows; ++i) {
- pthread_cond_init(&lf_sync->cond_[i], NULL);
+ CHECK_MEM_ERROR(cm, lf_sync->cond_,
+ vpx_malloc(sizeof(*lf_sync->cond_) * rows));
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&lf_sync->cond_[i], NULL);
+ }
}
#endif // CONFIG_MULTITHREAD
@@ -251,23 +245,19 @@ void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows,
}
// Deallocate lf synchronization related mutex and data
-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) {
-#if !CONFIG_MULTITHREAD
- (void)rows;
-#endif // !CONFIG_MULTITHREAD
-
+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
if (lf_sync != NULL) {
#if CONFIG_MULTITHREAD
int i;
if (lf_sync->mutex_ != NULL) {
- for (i = 0; i < rows; ++i) {
+ for (i = 0; i < lf_sync->rows; ++i) {
pthread_mutex_destroy(&lf_sync->mutex_[i]);
}
vpx_free(lf_sync->mutex_);
}
if (lf_sync->cond_ != NULL) {
- for (i = 0; i < rows; ++i) {
+ for (i = 0; i < lf_sync->rows; ++i) {
pthread_cond_destroy(&lf_sync->cond_[i]);
}
vpx_free(lf_sync->cond_);
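
The dthread changes make the sync object own its row count: vp9_loop_filter_alloc() records rows in the struct, so vp9_loop_filter_dealloc() no longer needs callers to recompute it (which the old decoder_remove and frame-resize paths each did slightly differently). A self-contained sketch of the pattern with stand-in names:

#include <pthread.h>
#include <stdlib.h>

typedef struct {
  pthread_mutex_t *mutex_;
  int rows;
} LfSync;

static int lf_sync_alloc(LfSync *s, int rows) {
  int i;
  s->rows = rows;  /* the size travels with the object */
  s->mutex_ = malloc(sizeof(*s->mutex_) * rows);
  if (s->mutex_ == NULL) return -1;
  for (i = 0; i < rows; ++i)
    pthread_mutex_init(&s->mutex_[i], NULL);
  return 0;
}

static void lf_sync_dealloc(LfSync *s) {  /* no external rows parameter */
  if (s != NULL && s->mutex_ != NULL) {
    int i;
    for (i = 0; i < s->rows; ++i)
      pthread_mutex_destroy(&s->mutex_[i]);
    free(s->mutex_);
    s->mutex_ = NULL;
  }
}

int main(void) {
  LfSync s = {NULL, 0};
  if (lf_sync_alloc(&s, 8) == 0)
    lf_sync_dealloc(&s);
  return 0;
}
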
diff --git a/source/libvpx/vp9/decoder/vp9_dthread.h b/source/libvpx/vp9/decoder/vp9_dthread.h
index 423bd88..8b02ef7 100644
--- a/source/libvpx/vp9/decoder/vp9_dthread.h
+++ b/source/libvpx/vp9/decoder/vp9_dthread.h
@@ -38,6 +38,7 @@ typedef struct VP9LfSyncData {
// The optimal sync_range for different resolutions and platforms should be
// determined by testing. Currently, it is chosen to be a power-of-2 number.
int sync_range;
+ int rows;
} VP9LfSync;
// Allocate memory for loopfilter row synchronization.
@@ -45,7 +46,7 @@ void vp9_loop_filter_alloc(struct VP9Common *cm, VP9LfSync *lf_sync,
int rows, int width);
// Deallocate loopfilter synchronization related mutex and data.
-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows);
+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
// Multi-threaded loopfilter that uses the tile threads.
void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
diff --git a/source/libvpx/vp9/encoder/vp9_bitstream.c b/source/libvpx/vp9/encoder/vp9_bitstream.c
index bdb1338..b605248 100644
--- a/source/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/source/libvpx/vp9/encoder/vp9_bitstream.c
@@ -294,6 +294,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
vp9_write_token(w, vp9_switchable_interp_tree,
cm->fc.switchable_interp_prob[ctx],
&switchable_interp_encodings[mbmi->interp_filter]);
+ ++cpi->interp_filter_selected[0][mbmi->interp_filter];
} else {
assert(mbmi->interp_filter == cm->interp_filter);
}
@@ -670,8 +671,6 @@ static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) {
vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
- vp9_clear_system_state();
-
for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size],
frame_coef_probs[tx_size]);
@@ -998,8 +997,10 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
// Set "found" to 0 for temporal svc and for spatial svc key frame
if (cpi->use_svc &&
- (cpi->svc.number_spatial_layers == 1 ||
- cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame)) {
+ ((cpi->svc.number_temporal_layers > 1 &&
+ cpi->oxcf.rc_mode == VPX_CBR) ||
+ (cpi->svc.number_spatial_layers > 1 &&
+ cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame))) {
found = 0;
}
vp9_wb_write_bit(wb, found);
@@ -1045,8 +1046,8 @@ static void write_profile(BITSTREAM_PROFILE profile,
static void write_bitdepth_colorspace_sampling(
VP9_COMMON *const cm, struct vp9_write_bit_buffer *wb) {
if (cm->profile >= PROFILE_2) {
- assert(cm->bit_depth > BITS_8);
- vp9_wb_write_bit(wb, cm->bit_depth - BITS_10);
+ assert(cm->bit_depth > VPX_BITS_8);
+ vp9_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1);
}
vp9_wb_write_literal(wb, cm->color_space, 3);
if (cm->color_space != SRGB) {
@@ -1083,7 +1084,16 @@ static void write_uncompressed_header(VP9_COMP *cpi,
write_bitdepth_colorspace_sampling(cm, wb);
write_frame_size(cm, wb);
} else {
- if (!cm->show_frame)
+ // In spatial SVC, when error_resilient_mode is off, we need to code all
+ // visible frames as invisible. But we keep the show_frame flag so that
+ // the publisher knows whether the frame is supposed to be visible.
+ // So we code the show_frame flag as-is and then code the intra_only bit
+ // here. This makes the bitstream incompatible. In the player we will
+ // change the show_frame flag to 0, then add a one-byte frame with the
+ // show_existing_frame flag which tells the decoder which frame to show.
+ if (!cm->show_frame ||
+ (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0))
vp9_wb_write_bit(wb, cm->intra_only);
if (!cm->error_resilient_mode)
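
Of note in write_bitdepth_colorspace_sampling(): for profile 2 and above the header now writes an explicit flag, 0 for 10-bit and 1 for 12-bit, instead of the old enum subtraction. A small sketch of the mapping, with a stand-in enum for vpx_bit_depth_t:

#include <assert.h>
#include <stdio.h>

typedef enum { BITS_8 = 8, BITS_10 = 10, BITS_12 = 12 } BitDepth;

static int bitdepth_bit(BitDepth d) {
  assert(d > BITS_8);  /* profiles 0/1 never reach this branch */
  return d == BITS_10 ? 0 : 1;
}

int main(void) {
  printf("10-bit -> %d, 12-bit -> %d\n",
         bitdepth_bit(BITS_10), bitdepth_bit(BITS_12));
  return 0;
}
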
diff --git a/source/libvpx/vp9/encoder/vp9_bitstream.h b/source/libvpx/vp9/encoder/vp9_bitstream.h
index 8e82d1c..b488261 100644
--- a/source/libvpx/vp9/encoder/vp9_bitstream.h
+++ b/source/libvpx/vp9/encoder/vp9_bitstream.h
@@ -26,7 +26,7 @@ static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
cpi->rc.is_src_frame_alt_ref &&
(!cpi->use_svc || // Add spatial svc base layer case here
- (is_spatial_svc(cpi) &&
+ (is_two_pass_svc(cpi) &&
cpi->svc.spatial_layer_id == 0 &&
cpi->svc.layer_context[0].gold_ref_idx >=0 &&
cpi->oxcf.ss_play_alternate[0]));
diff --git a/source/libvpx/vp9/encoder/vp9_block.h b/source/libvpx/vp9/encoder/vp9_block.h
index bd3b0fd..b726383 100644
--- a/source/libvpx/vp9/encoder/vp9_block.h
+++ b/source/libvpx/vp9/encoder/vp9_block.h
@@ -76,16 +76,12 @@ struct macroblock {
int pred_mv_sad[MAX_REF_FRAMES];
int nmvjointcost[MV_JOINTS];
- int nmvcosts[2][MV_VALS];
int *nmvcost[2];
- int nmvcosts_hp[2][MV_VALS];
int *nmvcost_hp[2];
int **mvcost;
int nmvjointsadcost[MV_JOINTS];
- int nmvsadcosts[2][MV_VALS];
int *nmvsadcost[2];
- int nmvsadcosts_hp[2][MV_VALS];
int *nmvsadcost_hp[2];
int **mvsadcost;
@@ -116,9 +112,9 @@ struct macroblock {
int quant_fp;
// skip forward transform and quantization
- int skip_txfm[MAX_MB_PLANE];
+ uint8_t skip_txfm[MAX_MB_PLANE << 2];
- int64_t bsse[MAX_MB_PLANE];
+ int64_t bsse[MAX_MB_PLANE << 2];
// Used to store sub partition's choices.
MV pred_mv[MAX_REF_FRAMES];
diff --git a/source/libvpx/vp9/encoder/vp9_context_tree.h b/source/libvpx/vp9/encoder/vp9_context_tree.h
index d60e6c3..236389b 100644
--- a/source/libvpx/vp9/encoder/vp9_context_tree.h
+++ b/source/libvpx/vp9/encoder/vp9_context_tree.h
@@ -33,7 +33,10 @@ typedef struct {
int is_coded;
int num_4x4_blk;
int skip;
- int skip_txfm[MAX_MB_PLANE];
+ // For the current partition, skippable is set to 1 only if all Y, U, and
+ // V transform blocks' coefficients are quantized to 0.
+ int skippable;
+ uint8_t skip_txfm[MAX_MB_PLANE << 2];
int best_mode_index;
int hybrid_pred_diff;
int comp_pred_diff;
diff --git a/source/libvpx/vp9/encoder/vp9_denoiser.c b/source/libvpx/vp9/encoder/vp9_denoiser.c
index 90ea9cc..c4cf5ee 100644
--- a/source/libvpx/vp9/encoder/vp9_denoiser.c
+++ b/source/libvpx/vp9/encoder/vp9_denoiser.c
@@ -78,7 +78,8 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride,
int mc_avg_stride,
uint8_t *avg, int avg_stride,
int increase_denoising,
- BLOCK_SIZE bs) {
+ BLOCK_SIZE bs,
+ int motion_magnitude) {
int r, c;
const uint8_t *sig_start = sig;
const uint8_t *mc_avg_start = mc_avg;
@@ -86,6 +87,19 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride,
int diff, adj, absdiff, delta;
int adj_val[] = {3, 4, 6};
int total_adj = 0;
+ int shift_inc = 1;
+
+ /* If motion_magnitude is small, make the denoiser more aggressive by
+  * increasing the adjustment for each level. Add another increment for
+  * blocks that are labeled for increased denoising. */
+ if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
+ if (increase_denoising) {
+ shift_inc = 2;
+ }
+ adj_val[0] += shift_inc;
+ adj_val[1] += shift_inc;
+ adj_val[2] += shift_inc;
+ }
// First attempt to apply a strong temporal denoising filter.
for (r = 0; r < heights[bs]; ++r) {
@@ -130,7 +144,8 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride,
// Otherwise, we try to dampen the filter if the delta is not too high.
delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising))
>> 8) + 1;
- if (delta > delta_thresh(bs, increase_denoising)) {
+
+ if (delta >= delta_thresh(bs, increase_denoising)) {
return COPY_BLOCK;
}
@@ -145,11 +160,17 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride,
adj = delta;
}
if (diff > 0) {
+ // A positive diff means we made a positive adjustment in the first
+ // attempt above, so now make a negative adjustment to bring the
+ // denoised signal down.
avg[c] = MAX(0, avg[c] - adj);
- total_adj += adj;
+ total_adj -= adj;
} else {
+ // A negative diff means we made a negative adjustment in the first
+ // attempt above, so now make a positive adjustment to bring the
+ // denoised signal up.
avg[c] = MIN(UINT8_MAX, avg[c] + adj);
- total_adj -= adj;
+ total_adj += adj;
}
}
sig += sig_stride;
@@ -185,7 +206,8 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
int increase_denoising,
int mi_row,
int mi_col,
- PICK_MODE_CONTEXT *ctx
+ PICK_MODE_CONTEXT *ctx,
+ int *motion_magnitude
) {
int mv_col, mv_row;
int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
@@ -210,6 +232,8 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
mv_col = ctx->best_sse_mv.as_mv.col;
mv_row = ctx->best_sse_mv.as_mv.row;
+ *motion_magnitude = mv_row * mv_row + mv_col * mv_col;
+
frame = ctx->best_reference_frame;
// If the best reference frame uses inter-prediction and there is enough of a
@@ -297,6 +321,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
int mi_row, int mi_col, BLOCK_SIZE bs,
PICK_MODE_CONTEXT *ctx) {
+ int motion_magnitude = 0;
VP9_DENOISER_DECISION decision = FILTER_BLOCK;
YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
@@ -307,13 +332,14 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
decision = perform_motion_compensation(denoiser, mb, bs,
denoiser->increase_denoising,
- mi_row, mi_col, ctx);
+ mi_row, mi_col, ctx,
+ &motion_magnitude);
if (decision == FILTER_BLOCK) {
decision = denoiser_filter(src.buf, src.stride,
mc_avg_start, mc_avg.y_stride,
avg_start, avg.y_stride,
- 0, bs);
+ 0, bs, motion_magnitude);
}
if (decision == FILTER_BLOCK) {
@@ -370,8 +396,8 @@ void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
ctx->newmv_sse = UINT_MAX;
}
-void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi,
- unsigned int sse, PREDICTION_MODE mode,
+void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
+ PREDICTION_MODE mode,
PICK_MODE_CONTEXT *ctx) {
// TODO(tkopp): Use both MVs if possible
if (mbmi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) {
@@ -388,13 +414,21 @@ void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi,
}
int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
- int ssx, int ssy, int border) {
+ int ssx, int ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ int border) {
int i, fail;
assert(denoiser != NULL);
for (i = 0; i < MAX_REF_FRAMES; ++i) {
fail = vp9_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
- ssx, ssy, border);
+ ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ border);
if (fail) {
vp9_denoiser_free(denoiser);
return 1;
@@ -405,7 +439,11 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
}
fail = vp9_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height,
- ssx, ssy, border);
+ ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ border);
if (fail) {
vp9_denoiser_free(denoiser);
return 1;
diff --git a/source/libvpx/vp9/encoder/vp9_denoiser.h b/source/libvpx/vp9/encoder/vp9_denoiser.h
index d93846f..a913add 100644
--- a/source/libvpx/vp9/encoder/vp9_denoiser.h
+++ b/source/libvpx/vp9/encoder/vp9_denoiser.h
@@ -18,6 +18,8 @@
extern "C" {
#endif
+#define MOTION_MAGNITUDE_THRESHOLD (8*3)
+
typedef enum vp9_denoiser_decision {
COPY_BLOCK,
FILTER_BLOCK
@@ -42,12 +44,16 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
-void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi,
+void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi,
unsigned int sse, PREDICTION_MODE mode,
PICK_MODE_CONTEXT *ctx);
int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
- int ssx, int ssy, int border);
+ int ssx, int ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ int border);
void vp9_denoiser_free(VP9_DENOISER *denoiser);
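
Threading motion_magnitude into denoiser_filter() lets filter strength adapt to motion: the magnitude is the squared MV length, and at or below MOTION_MAGNITUDE_THRESHOLD (8*3) every adjustment level is raised, with one extra step for blocks flagged for increased denoising. A runnable sketch of just that rule:

#include <stdio.h>

#define MOTION_MAGNITUDE_THRESHOLD (8 * 3)

static void adapt_strength(int mv_row, int mv_col, int increase_denoising,
                           int adj_val[3]) {
  const int motion_magnitude = mv_row * mv_row + mv_col * mv_col;
  if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
    /* Low motion: denoise harder; flagged blocks get one more step. */
    const int shift_inc = increase_denoising ? 2 : 1;
    adj_val[0] += shift_inc;
    adj_val[1] += shift_inc;
    adj_val[2] += shift_inc;
  }
}

int main(void) {
  int adj[3] = {3, 4, 6};        /* base levels from denoiser_filter */
  adapt_strength(2, 3, 0, adj);  /* |mv|^2 = 13 <= 24: strengthened */
  printf("%d %d %d\n", adj[0], adj[1], adj[2]);  /* prints 4 5 7 */
  return 0;
}
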
diff --git a/source/libvpx/vp9/encoder/vp9_encodeframe.c b/source/libvpx/vp9/encoder/vp9_encodeframe.c
index 711354b..72ced05 100644
--- a/source/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/source/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -727,6 +727,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
p[i].eobs = ctx->eobs_pbuf[i][0];
}
ctx->is_coded = 0;
+ ctx->skippable = 0;
x->skip_recode = 0;
// Set to zero to make sure we do not use the previous encoded frame stats
@@ -1232,30 +1233,23 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
}
}
-static int is_background(VP9_COMP *cpi, const TileInfo *const tile,
+static int is_background(const VP9_COMP *cpi, const TileInfo *const tile,
int mi_row, int mi_col) {
- MACROBLOCK *x = &cpi->mb;
- uint8_t *src, *pre;
- int src_stride, pre_stride;
-
+ // This assumes the input source frames are of the same dimensions.
const int row8x8_remaining = tile->mi_row_end - mi_row;
const int col8x8_remaining = tile->mi_col_end - mi_col;
-
+ const int x = mi_col * MI_SIZE;
+ const int y = mi_row * MI_SIZE;
+ const int src_stride = cpi->Source->y_stride;
+ const uint8_t *const src = &cpi->Source->y_buffer[y * src_stride + x];
+ const int pre_stride = cpi->Last_Source->y_stride;
+ const uint8_t *const pre = &cpi->Last_Source->y_buffer[y * pre_stride + x];
int this_sad = 0;
int threshold = 0;
- // This assumes the input source frames are of the same dimension.
- src_stride = cpi->Source->y_stride;
- src = cpi->Source->y_buffer + (mi_row * MI_SIZE) * src_stride +
- (mi_col * MI_SIZE);
- pre_stride = cpi->Last_Source->y_stride;
- pre = cpi->Last_Source->y_buffer + (mi_row * MI_SIZE) * pre_stride +
- (mi_col * MI_SIZE);
-
if (row8x8_remaining >= MI_BLOCK_SIZE &&
col8x8_remaining >= MI_BLOCK_SIZE) {
- this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride,
- pre, pre_stride);
+ this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride, pre, pre_stride);
threshold = (1 << 12);
} else {
int r, c;
@@ -1266,8 +1260,7 @@ static int is_background(VP9_COMP *cpi, const TileInfo *const tile,
threshold = (row8x8_remaining * col8x8_remaining) << 6;
}
- x->in_static_area = (this_sad < 2 * threshold);
- return x->in_static_area;
+ return this_sad < 2 * threshold;
}
static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8,
@@ -2166,8 +2159,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
if (sum_rd < best_rd) {
- int64_t stop_thresh = 4096;
- int64_t stop_thresh_rd;
+ int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr;
+ int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr;
best_rate = this_rate;
best_dist = this_dist;
@@ -2175,14 +2168,18 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (bsize >= BLOCK_8X8)
pc_tree->partitioning = PARTITION_NONE;
- // Adjust threshold according to partition size.
- stop_thresh >>= 8 - (b_width_log2(bsize) +
+ // Adjust dist breakout threshold according to the partition size.
+ dist_breakout_thr >>= 8 - (b_width_log2(bsize) +
b_height_log2(bsize));
- stop_thresh_rd = RDCOST(x->rdmult, x->rddiv, 0, stop_thresh);
- // If obtained distortion is very small, choose current partition
- // and stop splitting.
- if (!x->e_mbd.lossless && best_rd < stop_thresh_rd) {
+ // If all Y, U, and V transform blocks in this partition are skippable,
+ // and the dist & rate are within the thresholds, the partition search is
+ // terminated for the current branch of the partition search tree.
+ // The dist & rate thresholds are set to 0 at speed 0 to disable this
+ // early termination at that speed.
+ if (!x->e_mbd.lossless &&
+ (ctx->skippable && best_dist < dist_breakout_thr &&
+ best_rate < rate_breakout_thr)) {
do_split = 0;
do_rect = 0;
}
@@ -2606,8 +2603,6 @@ static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) {
static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
if (cpi->mb.e_mbd.lossless)
return ONLY_4X4;
- if (cpi->common.frame_type == KEY_FRAME)
- return TX_MODE_SELECT;
if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
return ALLOW_32X32;
else if (cpi->sf.tx_size_search_method == USE_FULL_RD||
@@ -3119,7 +3114,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
break;
case REFERENCE_PARTITION:
if (sf->partition_check ||
- !is_background(cpi, tile, mi_row, mi_col)) {
+ !(x->in_static_area = is_background(cpi, tile, mi_row, mi_col))) {
set_modeinfo_offsets(cm, xd, mi_row, mi_col);
auto_partition_range(cpi, tile, mi_row, mi_col,
&sf->min_partition_size,
@@ -3297,7 +3292,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
vp9_zero(cm->counts);
vp9_zero(cpi->coef_counts);
- vp9_zero(cpi->tx_stepdown_count);
vp9_zero(rd_opt->comp_pred_diff);
vp9_zero(rd_opt->filter_diff);
vp9_zero(rd_opt->tx_select_diff);
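
The rd_pick_partition() hunk swaps the fixed 4096 stop threshold for speed-feature breakout thresholds and scales the distortion one by partition area: b_width_log2()/b_height_log2() count 4x4 units, so 64x64 (4 + 4 = 8) keeps the full threshold while 8x8 (1 + 1 = 2) shifts it down by 6, dividing it by 64. A worked sketch with a stand-in helper and a hypothetical base value:

#include <stdint.h>
#include <stdio.h>

/* log2 of the block width in 4x4 units, i.e. b_width_log2()'s job. */
static int width_log2_4x4(int pixels) {
  int n = 0;
  for (pixels >>= 2; pixels > 1; pixels >>= 1) ++n;
  return n;
}

int main(void) {
  const int64_t base_thr = 1 << 16;  /* hypothetical speed-feature value */
  const int sizes[] = {64, 32, 16, 8};
  int i;
  for (i = 0; i < 4; ++i) {
    const int lg = width_log2_4x4(sizes[i]);
    const int64_t thr = base_thr >> (8 - 2 * lg);  /* square blocks: w == h */
    printf("%2dx%-2d -> dist breakout threshold %lld\n",
           sizes[i], sizes[i], (long long)thr);
  }
  return 0;
}
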
diff --git a/source/libvpx/vp9/encoder/vp9_encodemb.c b/source/libvpx/vp9/encoder/vp9_encodemb.c
index 8a737e1..6678450 100644
--- a/source/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/source/libvpx/vp9/encoder/vp9_encodemb.c
@@ -476,20 +476,24 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
}
if (!x->skip_recode) {
- if (x->skip_txfm[plane] == 0) {
- // full forward transform and quantization
- if (x->quant_fp)
- vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
- else
- vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
- } else if (x->skip_txfm[plane] == 2) {
- // fast path forward transform and quantization
- vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+ if (max_txsize_lookup[plane_bsize] == tx_size) {
+ if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
+ // full forward transform and quantization
+ if (x->quant_fp)
+ vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
+ else
+ vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+ } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
+ // fast path forward transform and quantization
+ vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+ } else {
+ // skip forward transform
+ p->eobs[block] = 0;
+ *a = *l = 0;
+ return;
+ }
} else {
- // skip forward transform
- p->eobs[block] = 0;
- *a = *l = 0;
- return;
+ vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
}
}
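
This encode_block() change pairs with the widened arrays in vp9_block.h: skip_txfm now holds four slots per plane (MAX_MB_PLANE << 2), and a transform block maps to a slot by dividing its raster index, counted in 4x4 units, by the units per transform block, block >> (tx_size << 1), since a transform at tx_size covers 4^tx_size units. A tiny sketch of the lookup using the value scheme above (0 = full transform + quant, 2 = DC-only fast path, otherwise skip):

#include <stdint.h>
#include <stdio.h>

#define MAX_MB_PLANE 3

/* Slot for (plane, block) at a given tx_size; tx_size: 0=4x4 .. 3=32x32. */
static int skip_txfm_slot(int plane, int block, int tx_size) {
  return (plane << 2) + (block >> (tx_size << 1));
}

int main(void) {
  uint8_t skip_txfm[MAX_MB_PLANE << 2] = {0};
  skip_txfm[skip_txfm_slot(0, 4, 1)] = 2;  /* 2nd 8x8 tx block of plane 0 */
  printf("%d\n", skip_txfm[1]);            /* slot (0<<2) + (4>>2) = 1 */
  return 0;
}
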
diff --git a/source/libvpx/vp9/encoder/vp9_encodemv.c b/source/libvpx/vp9/encoder/vp9_encodemv.c
index 9ad6db0..9d42a12 100644
--- a/source/libvpx/vp9/encoder/vp9_encodemv.c
+++ b/source/libvpx/vp9/encoder/vp9_encodemv.c
@@ -216,7 +216,7 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
// If auto_mv_step_size is enabled then keep track of the largest
// motion vector component used.
- if (!cpi->dummy_packing && cpi->sf.mv.auto_mv_step_size) {
+ if (cpi->sf.mv.auto_mv_step_size) {
unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3;
cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude);
}
diff --git a/source/libvpx/vp9/encoder/vp9_encoder.c b/source/libvpx/vp9/encoder/vp9_encoder.c
index d27620c..2ca91b9 100644
--- a/source/libvpx/vp9/encoder/vp9_encoder.c
+++ b/source/libvpx/vp9/encoder/vp9_encoder.c
@@ -24,6 +24,7 @@
#include "vp9/common/vp9_postproc.h"
#endif
#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_tile_common.h"
@@ -128,11 +129,13 @@ static void setup_frame(VP9_COMP *cpi) {
}
if (cm->frame_type == KEY_FRAME) {
- if (!is_spatial_svc(cpi))
+ if (!is_two_pass_svc(cpi))
cpi->refresh_golden_frame = 1;
cpi->refresh_alt_ref_frame = 1;
+ vp9_zero(cpi->interp_filter_selected);
} else {
cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ vp9_zero(cpi->interp_filter_selected[0]);
}
}
@@ -140,7 +143,9 @@ void vp9_initialize_enc() {
static int init_done = 0;
if (!init_done) {
+ vp9_rtcd();
vp9_init_neighbors();
+ vp9_init_intra_predictors();
vp9_coef_tree_initialize();
vp9_tokenize_initialize();
vp9_init_me_luts();
@@ -167,6 +172,26 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
vpx_free(cpi->complexity_map);
cpi->complexity_map = NULL;
+ vpx_free(cpi->nmvcosts[0]);
+ vpx_free(cpi->nmvcosts[1]);
+ cpi->nmvcosts[0] = NULL;
+ cpi->nmvcosts[1] = NULL;
+
+ vpx_free(cpi->nmvcosts_hp[0]);
+ vpx_free(cpi->nmvcosts_hp[1]);
+ cpi->nmvcosts_hp[0] = NULL;
+ cpi->nmvcosts_hp[1] = NULL;
+
+ vpx_free(cpi->nmvsadcosts[0]);
+ vpx_free(cpi->nmvsadcosts[1]);
+ cpi->nmvsadcosts[0] = NULL;
+ cpi->nmvsadcosts[1] = NULL;
+
+ vpx_free(cpi->nmvsadcosts_hp[0]);
+ vpx_free(cpi->nmvsadcosts_hp[1]);
+ cpi->nmvsadcosts_hp[0] = NULL;
+ cpi->nmvsadcosts_hp[1] = NULL;
+
vp9_cyclic_refresh_free(cpi->cyclic_refresh);
cpi->cyclic_refresh = NULL;
@@ -212,8 +237,15 @@ static void save_coding_context(VP9_COMP *cpi) {
// intended for use in a re-code loop in vp9_compress_frame where the
// quantizer value is adjusted between loop iterations.
vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost);
- vp9_copy(cc->nmvcosts, cpi->mb.nmvcosts);
- vp9_copy(cc->nmvcosts_hp, cpi->mb.nmvcosts_hp);
+
+ vpx_memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
+ MV_VALS * sizeof(*cpi->nmvcosts[0]));
+ vpx_memcpy(cc->nmvcosts[1], cpi->nmvcosts[1],
+ MV_VALS * sizeof(*cpi->nmvcosts[1]));
+ vpx_memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0],
+ MV_VALS * sizeof(*cpi->nmvcosts_hp[0]));
+ vpx_memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
+ MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
@@ -233,8 +265,15 @@ static void restore_coding_context(VP9_COMP *cpi) {
// Restore key state variables to the snapshot state stored in the
// previous call to vp9_save_coding_context.
vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
- vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
- vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
+
+ vpx_memcpy(cpi->nmvcosts[0], cc->nmvcosts[0],
+ MV_VALS * sizeof(*cc->nmvcosts[0]));
+ vpx_memcpy(cpi->nmvcosts[1], cc->nmvcosts[1],
+ MV_VALS * sizeof(*cc->nmvcosts[1]));
+ vpx_memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0],
+ MV_VALS * sizeof(*cc->nmvcosts_hp[0]));
+ vpx_memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
+ MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
@@ -386,27 +425,15 @@ static void update_reference_segmentation_map(VP9_COMP *cpi) {
}
}
-
-static void set_speed_features(VP9_COMP *cpi) {
-#if CONFIG_INTERNAL_STATS
- int i;
- for (i = 0; i < MAX_MODES; ++i)
- cpi->mode_chosen_counts[i] = 0;
-#endif
-
- vp9_set_speed_features(cpi);
-
- // Set rd thresholds based on mode and speed setting
- vp9_set_rd_speed_thresholds(cpi);
- vp9_set_rd_speed_thresholds_sub8x8(cpi);
-}
-
static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
const VP9EncoderConfig *oxcf = &cpi->oxcf;
cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
oxcf->lag_in_frames);
if (!cpi->lookahead)
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
@@ -415,6 +442,9 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
oxcf->width, oxcf->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate altref buffer");
@@ -432,6 +462,9 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate last frame buffer");
@@ -439,6 +472,9 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
if (vp9_realloc_frame_buffer(&cpi->scaled_source,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate scaled source buffer");
@@ -446,6 +482,9 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
if (vp9_realloc_frame_buffer(&cpi->scaled_last_source,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate scaled last source buffer");
@@ -474,10 +513,13 @@ static void update_frame_size(VP9_COMP *cpi) {
vp9_init_context_buffers(cm);
init_macroblockd(cm, xd);
- if (is_spatial_svc(cpi)) {
+ if (is_two_pass_svc(cpi)) {
if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to reallocate alt_ref_buffer");
@@ -526,7 +568,9 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
cpi->svc.number_temporal_layers = oxcf->ts_number_layers;
if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
- (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) {
+ ((cpi->svc.number_temporal_layers > 1 ||
+ cpi->svc.number_spatial_layers > 1) &&
+ cpi->oxcf.pass == 2)) {
vp9_init_layer_context(cpi);
}
@@ -564,9 +608,9 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
cm->bit_depth = oxcf->bit_depth;
if (cm->profile <= PROFILE_1)
- assert(cm->bit_depth == BITS_8);
+ assert(cm->bit_depth == VPX_BITS_8);
else
- assert(cm->bit_depth > BITS_8);
+ assert(cm->bit_depth > VPX_BITS_8);
cpi->oxcf = *oxcf;
@@ -618,7 +662,9 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
if ((cpi->svc.number_temporal_layers > 1 &&
cpi->oxcf.rc_mode == VPX_CBR) ||
- (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) {
+ ((cpi->svc.number_temporal_layers > 1 ||
+ cpi->svc.number_spatial_layers > 1) &&
+ cpi->oxcf.pass == 2)) {
vp9_update_layer_context_change_config(cpi,
(int)cpi->oxcf.target_bandwidth);
}
@@ -641,6 +687,9 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
if (cpi->oxcf.noise_sensitivity > 0) {
vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS);
}
#endif
@@ -707,8 +756,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
cm->error.setjmp = 1;
- vp9_rtcd();
-
cpi->use_svc = 0;
init_config(cpi, oxcf);
@@ -734,6 +781,23 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+ CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
+ CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
+ CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
+ CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
+ CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
+ CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
+ CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
+ CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
+
for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
sizeof(cpi->mbgraph_stats[0])); i++) {
CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats,
@@ -814,16 +878,16 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
cpi->first_time_stamp_ever = INT64_MAX;
cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
- cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX];
- cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX];
- cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX];
- cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX];
+ cpi->mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
+ cpi->mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
+ cpi->mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
+ cpi->mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
cal_nmvsadcosts(cpi->mb.nmvsadcost);
- cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX];
- cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX];
- cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX];
- cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
+ cpi->mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
+ cpi->mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
+ cpi->mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
+ cpi->mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
#if CONFIG_VP9_TEMPORAL_DENOISING
@@ -840,8 +904,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
kf_list = fopen("kf_list.stt", "w");
#endif
- cpi->output_pkt_list = oxcf->output_pkt_list;
-
cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
if (oxcf->pass == 1) {
@@ -851,7 +913,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
if (cpi->svc.number_spatial_layers > 1
- && cpi->svc.number_temporal_layers == 1) {
+ || cpi->svc.number_temporal_layers > 1) {
FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf;
FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = {0};
int i;
@@ -909,7 +971,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
}
}
- set_speed_features(cpi);
+ vp9_set_speed_features(cpi);
// Allocate memory to store variances for a frame.
CHECK_MEM_ERROR(cm, cpi->source_diff_var,
@@ -1394,40 +1456,6 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
vp9_extend_frame_borders(dst);
}
-#define WRITE_RECON_BUFFER 0
-#if WRITE_RECON_BUFFER
-void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
- FILE *yframe;
- int i;
- char filename[255];
-
- snprintf(filename, sizeof(filename), "cx\\y%04d.raw", this_frame);
- yframe = fopen(filename, "wb");
-
- for (i = 0; i < frame->y_height; i++)
- fwrite(frame->y_buffer + i * frame->y_stride,
- frame->y_width, 1, yframe);
-
- fclose(yframe);
- snprintf(filename, sizeof(filename), "cx\\u%04d.raw", this_frame);
- yframe = fopen(filename, "wb");
-
- for (i = 0; i < frame->uv_height; i++)
- fwrite(frame->u_buffer + i * frame->uv_stride,
- frame->uv_width, 1, yframe);
-
- fclose(yframe);
- snprintf(filename, sizeof(filename), "cx\\v%04d.raw", this_frame);
- yframe = fopen(filename, "wb");
-
- for (i = 0; i < frame->uv_height; i++)
- fwrite(frame->v_buffer + i * frame->uv_stride,
- frame->uv_width, 1, yframe);
-
- fclose(yframe);
-}
-#endif
-
// Function to test for conditions that indicate we should loop
// back and recode a frame.
static int recode_loop_test(const VP9_COMP *cpi,
@@ -1493,7 +1521,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
cpi->alt_fb_idx = cpi->gld_fb_idx;
cpi->gld_fb_idx = tmp;
- if (is_spatial_svc(cpi)) {
+ if (is_two_pass_svc(cpi)) {
cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx;
cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx;
}
@@ -1507,17 +1535,32 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[arf_idx], cm->new_fb_idx);
+ vpx_memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
}
if (cpi->refresh_golden_frame) {
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+ if (!cpi->rc.is_src_frame_alt_ref)
+ vpx_memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ else
+ vpx_memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+ cpi->interp_filter_selected[ALTREF_FRAME],
+ sizeof(cpi->interp_filter_selected[ALTREF_FRAME]));
}
}
if (cpi->refresh_last_frame) {
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
+ if (!cpi->rc.is_src_frame_alt_ref)
+ vpx_memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
}
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0) {
@@ -1572,6 +1615,9 @@ void vp9_scale_references(VP9_COMP *cpi) {
vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
@@ -1746,7 +1792,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
// to recode.
if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
save_coding_context(cpi);
- cpi->dummy_packing = 1;
if (!cpi->sf.use_nonrd_pick_mode)
vp9_pack_bitstream(cpi, dest, size);
@@ -1905,8 +1950,7 @@ static int get_ref_frame_flags(const VP9_COMP *cpi) {
if (gold_is_last)
flags &= ~VP9_GOLD_FLAG;
- if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
- !is_spatial_svc(cpi))
+ if (cpi->rc.frames_till_gf_update_due == INT_MAX && !is_two_pass_svc(cpi))
flags &= ~VP9_GOLD_FLAG;
if (alt_is_last)
@@ -1947,18 +1991,16 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
}
}
-static void configure_skippable_frame(VP9_COMP *cpi) {
+static int is_skippable_frame(const VP9_COMP *cpi) {
// If no non-zero motion vector is detected in the first pass for the
// current frame, nor for its previous and forward frames, then this frame
// can be skipped for the partition check, and the partition size is
// assigned according to the variance.
+ const SVC *const svc = &cpi->svc;
+ const TWO_PASS *const twopass = is_two_pass_svc(cpi) ?
+ &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
- SVC *const svc = &cpi->svc;
- TWO_PASS *const twopass = is_spatial_svc(cpi) ?
- &svc->layer_context[svc->spatial_layer_id].twopass
- : &cpi->twopass;
-
- cpi->skippable_frame = (!frame_is_intra_only(&cpi->common) &&
+ return (!frame_is_intra_only(&cpi->common) &&
twopass->stats_in - 2 > twopass->stats_in_start &&
twopass->stats_in < twopass->stats_in_end &&
(twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion
@@ -2008,11 +2050,39 @@ static void set_mv_search_params(VP9_COMP *cpi) {
}
}
+
+int setup_interp_filter_search_mask(VP9_COMP *cpi) {
+ INTERP_FILTER ifilter;
+ int ref_total[MAX_REF_FRAMES] = {0};
+ MV_REFERENCE_FRAME ref;
+ int mask = 0;
+ if (cpi->common.last_frame_type == KEY_FRAME ||
+ cpi->refresh_alt_ref_frame)
+ return mask;
+ for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
+ for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter)
+ ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
+
+ for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) {
+ if ((ref_total[LAST_FRAME] &&
+ cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) &&
+ (ref_total[GOLDEN_FRAME] == 0 ||
+ cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50
+ < ref_total[GOLDEN_FRAME]) &&
+ (ref_total[ALTREF_FRAME] == 0 ||
+ cpi->interp_filter_selected[ALTREF_FRAME][ifilter] * 50
+ < ref_total[ALTREF_FRAME]))
+ mask |= 1 << ifilter;
+ }
+ return mask;
+}
+
static void encode_frame_to_data_rate(VP9_COMP *cpi,
size_t *size,
uint8_t *dest,
unsigned int *frame_flags) {
VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
struct segmentation *const seg = &cm->seg;
TX_SIZE t;
int q;
@@ -2046,6 +2116,12 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
set_mv_search_params(cpi);
+ if (cpi->oxcf.pass == 2 &&
+ cpi->sf.adaptive_interp_filter_search)
+ cpi->sf.interp_filter_search_mask =
+ setup_interp_filter_search_mask(cpi);
+
// Set various flags etc to special state if it is a key frame.
if (frame_is_intra_only(cm)) {
// Reset the loop filter deltas and segmentation map.
@@ -2060,9 +2136,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// The alternate reference frame cannot be active for a key frame.
cpi->rc.source_alt_ref_active = 0;
- cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
- cm->frame_parallel_decoding_mode =
- (cpi->oxcf.frame_parallel_decoding_mode != 0);
+ cm->error_resilient_mode = oxcf->error_resilient_mode;
// By default, encoder assumes decoder can use prev_mi.
if (cm->error_resilient_mode) {
@@ -2070,29 +2144,59 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cm->reset_frame_context = 0;
cm->refresh_frame_context = 0;
} else if (cm->intra_only) {
+ cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
// Only reset the current context.
cm->reset_frame_context = 2;
}
}
+ if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) {
+ cm->frame_context_idx =
+ cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers +
+ cpi->svc.temporal_layer_id;
+
+ // The probs will be updated based on the frame type of the previous
+ // frame if frame_parallel_decoding_mode is 0. The type may vary for
+ // the frame after a key frame in the base layer, since we may drop
+ // enhancement layers. So set frame_parallel_decoding_mode to 1 here.
+ if (cpi->svc.number_temporal_layers == 1) {
+ if (cpi->svc.spatial_layer_id == 0 &&
+ cpi->svc.layer_context[0].last_frame_type == KEY_FRAME)
+ cm->frame_parallel_decoding_mode = 1;
+ else
+ cm->frame_parallel_decoding_mode = 0;
+ } else if (cpi->svc.spatial_layer_id == 0) {
+ // Find the 2nd frame in the temporal base layer and the 1st frame in the
+ // temporal enhancement layers from the key frame.
+ int i;
+ for (i = 0; i < cpi->svc.number_temporal_layers; ++i) {
+ if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) {
+ cm->frame_parallel_decoding_mode = 1;
+ break;
+ }
+ }
+ if (i == cpi->svc.number_temporal_layers)
+ cm->frame_parallel_decoding_mode = 0;
+ }
+ }
// Configure experimental use of segmentation for enhanced coding of
// static regions if indicated.
// Only allowed in second pass of two pass (as it requires lagged coding)
// and if the relevant speed feature flag is set.
- if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation)
+ if (oxcf->pass == 2 && cpi->sf.static_segmentation)
configure_static_seg_features(cpi);
// Check if the current frame is skippable for the partition search in the
// second pass according to the first pass stats
- if (cpi->oxcf.pass == 2 &&
- (!cpi->use_svc || is_spatial_svc(cpi))) {
- configure_skippable_frame(cpi);
+ if (oxcf->pass == 2 &&
+ (!cpi->use_svc || is_two_pass_svc(cpi))) {
+ cpi->skippable_frame = is_skippable_frame(cpi);
}
// For 1 pass CBR, check if we are dropping this frame.
// Never drop on key frame.
- if (cpi->oxcf.pass == 0 &&
- cpi->oxcf.rc_mode == VPX_CBR &&
+ if (oxcf->pass == 0 &&
+ oxcf->rc_mode == VPX_CBR &&
cm->frame_type != KEY_FRAME) {
if (vp9_rc_drop_frame(cpi)) {
vp9_rc_postencode_update_drop_frame(cpi);
@@ -2104,9 +2208,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
vp9_clear_system_state();
#if CONFIG_VP9_POSTPROC
- if (cpi->oxcf.noise_sensitivity > 0) {
+ if (oxcf->noise_sensitivity > 0) {
int l = 0;
- switch (cpi->oxcf.noise_sensitivity) {
+ switch (oxcf->noise_sensitivity) {
case 1:
l = 20;
break;
@@ -2128,7 +2232,16 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
}
#endif
- set_speed_features(cpi);
+#if CONFIG_INTERNAL_STATS
+ int i;
+ for (i = 0; i < MAX_MODES; ++i)
+ cpi->mode_chosen_counts[i] = 0;
+#endif
+
+ vp9_set_speed_features(cpi);
+
+ vp9_set_rd_speed_thresholds(cpi);
+ vp9_set_rd_speed_thresholds_sub8x8(cpi);
// Decide q and q bounds.
q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index);
@@ -2147,7 +2260,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
#if CONFIG_VP9_TEMPORAL_DENOISING
#ifdef OUTPUT_YUV_DENOISED
- if (cpi->oxcf.noise_sensitivity > 0) {
+ if (oxcf->noise_sensitivity > 0) {
vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME],
yuv_denoised_file);
}
@@ -2168,29 +2281,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cm->frame_to_show = get_frame_new_buffer(cm);
-#if WRITE_RECON_BUFFER
- if (cm->show_frame)
- write_cx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame);
- else
- write_cx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame + 1000);
-#endif
-
// Pick the loop filter level for the frame.
loopfilter_frame(cpi, cm);
-#if WRITE_RECON_BUFFER
- if (cm->show_frame)
- write_cx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame + 2000);
- else
- write_cx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame + 3000);
-#endif
-
// build the bitstream
- cpi->dummy_packing = 0;
vp9_pack_bitstream(cpi, dest, size);
if (cm->seg.update_map)
@@ -2249,8 +2343,12 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cm->last_height = cm->height;
// reset to normal state now that we are done.
- if (!cm->show_existing_frame)
- cm->last_show_frame = cm->show_frame;
+ if (!cm->show_existing_frame) {
+ if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0)
+ cm->last_show_frame = 0;
+ else
+ cm->last_show_frame = cm->show_frame;
+ }
if (cm->show_frame) {
vp9_swap_mi_and_prev_mi(cm);
@@ -2259,8 +2357,12 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// update not a real frame
++cm->current_video_frame;
if (cpi->use_svc)
- vp9_inc_frame_in_layer(&cpi->svc);
+ vp9_inc_frame_in_layer(cpi);
}
+
+ if (is_two_pass_svc(cpi))
+ cpi->svc.layer_context[cpi->svc.spatial_layer_id].last_frame_type =
+ cm->frame_type;
}
static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
@@ -2333,7 +2435,7 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
vpx_usec_timer_start(&timer);
#if CONFIG_SPATIAL_SVC
- if (is_spatial_svc(cpi))
+ if (is_two_pass_svc(cpi))
res = vp9_svc_lookahead_push(cpi, cpi->lookahead, sd, time_stamp, end_time,
frame_flags);
else
@@ -2375,8 +2477,8 @@ static int frame_is_reference(const VP9_COMP *cpi) {
cm->seg.update_data;
}
-void adjust_frame_rate(VP9_COMP *cpi) {
- const struct lookahead_entry *const source = cpi->source;
+void adjust_frame_rate(VP9_COMP *cpi,
+ const struct lookahead_entry *source) {
int64_t this_duration;
int step = 0;
@@ -2432,7 +2534,8 @@ static int get_arf_src_index(VP9_COMP *cpi) {
return arf_src_index;
}
-static void check_src_altref(VP9_COMP *cpi) {
+static void check_src_altref(VP9_COMP *cpi,
+ const struct lookahead_entry *source) {
RATE_CONTROL *const rc = &cpi->rc;
if (cpi->oxcf.pass == 2) {
@@ -2441,7 +2544,7 @@ static void check_src_altref(VP9_COMP *cpi) {
(gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
} else {
rc->is_src_frame_alt_ref = cpi->alt_ref_source &&
- (cpi->source == cpi->alt_ref_source);
+ (source == cpi->alt_ref_source);
}
if (rc->is_src_frame_alt_ref) {
@@ -2463,10 +2566,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
RATE_CONTROL *const rc = &cpi->rc;
struct vpx_usec_timer cmptimer;
YV12_BUFFER_CONFIG *force_src_buffer = NULL;
+ struct lookahead_entry *last_source = NULL;
+ struct lookahead_entry *source = NULL;
MV_REFERENCE_FRAME ref_frame;
int arf_src_index;
- if (is_spatial_svc(cpi) && oxcf->pass == 2) {
+ if (is_two_pass_svc(cpi) && oxcf->pass == 2) {
#if CONFIG_SPATIAL_SVC
vp9_svc_lookahead_peek(cpi, cpi->lookahead, 0, 1);
#endif
@@ -2475,9 +2580,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
vpx_usec_timer_start(&cmptimer);
- cpi->source = NULL;
- cpi->last_source = NULL;
-
vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
// Normal defaults
@@ -2493,17 +2595,16 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
assert(arf_src_index <= rc->frames_to_key);
#if CONFIG_SPATIAL_SVC
- if (is_spatial_svc(cpi))
- cpi->source = vp9_svc_lookahead_peek(cpi, cpi->lookahead,
- arf_src_index, 0);
+ if (is_two_pass_svc(cpi))
+ source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, arf_src_index, 0);
else
#endif
- cpi->source = vp9_lookahead_peek(cpi->lookahead, arf_src_index);
- if (cpi->source != NULL) {
- cpi->alt_ref_source = cpi->source;
+ source = vp9_lookahead_peek(cpi->lookahead, arf_src_index);
+ if (source != NULL) {
+ cpi->alt_ref_source = source;
#if CONFIG_SPATIAL_SVC
- if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0) {
+ if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) {
int i;
// Reference a hidden frame from a lower layer
for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) {
@@ -2534,46 +2635,44 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
}
- if (!cpi->source) {
+ if (!source) {
// Get last frame source.
if (cm->current_video_frame > 0) {
#if CONFIG_SPATIAL_SVC
- if (is_spatial_svc(cpi))
- cpi->last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0);
+ if (is_two_pass_svc(cpi))
+ last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0);
else
#endif
- cpi->last_source = vp9_lookahead_peek(cpi->lookahead, -1);
- if (cpi->last_source == NULL)
+ last_source = vp9_lookahead_peek(cpi->lookahead, -1);
+ if (last_source == NULL)
return -1;
}
// Read in the source frame.
#if CONFIG_SPATIAL_SVC
- if (is_spatial_svc(cpi))
- cpi->source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
+ if (is_two_pass_svc(cpi))
+ source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
else
#endif
- cpi->source = vp9_lookahead_pop(cpi->lookahead, flush);
- if (cpi->source != NULL) {
+ source = vp9_lookahead_pop(cpi->lookahead, flush);
+ if (source != NULL) {
cm->show_frame = 1;
cm->intra_only = 0;
// Check to see if the frame should be encoded as an arf overlay.
- check_src_altref(cpi);
+ check_src_altref(cpi, source);
}
}
- if (cpi->source) {
+ if (source) {
cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer
- : &cpi->source->img;
+ : &source->img;
- cpi->unscaled_last_source = cpi->last_source != NULL ?
- &cpi->last_source->img : NULL;
+ cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
- *time_stamp = cpi->source->ts_start;
- *time_end = cpi->source->ts_end;
- *frame_flags =
- (cpi->source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+ *time_stamp = source->ts_start;
+ *time_end = source->ts_end;
+ *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
} else {
*size = 0;
@@ -2584,9 +2683,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
return -1;
}
- if (cpi->source->ts_start < cpi->first_time_stamp_ever) {
- cpi->first_time_stamp_ever = cpi->source->ts_start;
- cpi->last_end_time_stamp_seen = cpi->source->ts_start;
+ if (source->ts_start < cpi->first_time_stamp_ever) {
+ cpi->first_time_stamp_ever = source->ts_start;
+ cpi->last_end_time_stamp_seen = source->ts_start;
}
// Clear down mmx registers
@@ -2594,7 +2693,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
// adjust frame rates based on timestamps given
if (cm->show_frame) {
- adjust_frame_rate(cpi);
+ adjust_frame_rate(cpi, source);
}
if (cpi->svc.number_temporal_layers > 1 &&
@@ -2636,6 +2735,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
alloc_util_frame_buffers(cpi);
@@ -2662,13 +2764,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
if (oxcf->pass == 1 &&
- (!cpi->use_svc || is_spatial_svc(cpi))) {
+ (!cpi->use_svc || is_two_pass_svc(cpi))) {
const int lossless = is_lossless_requested(oxcf);
cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
- vp9_first_pass(cpi);
+ vp9_first_pass(cpi, source);
} else if (oxcf->pass == 2 &&
- (!cpi->use_svc || is_spatial_svc(cpi))) {
+ (!cpi->use_svc || is_two_pass_svc(cpi))) {
Pass2Encode(cpi, size, dest, frame_flags);
} else if (cpi->use_svc) {
SvcEncode(cpi, size, dest, frame_flags);
@@ -2691,8 +2793,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
// Save layer specific state.
if ((cpi->svc.number_temporal_layers > 1 &&
- oxcf->rc_mode == VPX_CBR) ||
- (cpi->svc.number_spatial_layers > 1 && oxcf->pass == 2)) {
+ oxcf->rc_mode == VPX_CBR) ||
+ ((cpi->svc.number_temporal_layers > 1 ||
+ cpi->svc.number_spatial_layers > 1) &&
+ oxcf->pass == 2)) {
vp9_save_layer_context(cpi);
}
@@ -2744,12 +2848,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
cpi->totalp_sq_error += psnr2.sse[0];
cpi->totalp_samples += psnr2.samples[0];
- frame_ssim2 = vp9_calc_ssim(orig, recon, 1, &weight);
+ frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
cpi->summed_quality += frame_ssim2 * weight;
cpi->summed_weights += weight;
- frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, 1, &weight);
+ frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight);
cpi->summedp_quality += frame_ssim2 * weight;
cpi->summedp_weights += weight;
@@ -2765,6 +2869,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
}
+
if (cpi->b_calculate_ssimg) {
double y, u, v, frame_all;
frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
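
setup_interp_filter_search_mask() turns the new interp_filter_selected histograms into a mask: roughly, a filter is excluded from the search when the LAST reference never picked it and it made up under 1/50th of the picks for GOLDEN and ALTREF. A compact sketch of that rule over stand-in enums:

enum { EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP, N_FILTERS };
enum { LAST, GOLDEN, ALTREF, N_REFS };

static int filter_search_mask(const int selected[N_REFS][N_FILTERS]) {
  int totals[N_REFS] = {0}, mask = 0, f, r;
  for (r = 0; r < N_REFS; ++r)
    for (f = 0; f < N_FILTERS; ++f)
      totals[r] += selected[r][f];
  for (f = 0; f < N_FILTERS; ++f) {
    if (totals[LAST] && selected[LAST][f] == 0 &&
        (totals[GOLDEN] == 0 || selected[GOLDEN][f] * 50 < totals[GOLDEN]) &&
        (totals[ALTREF] == 0 || selected[ALTREF][f] * 50 < totals[ALTREF]))
      mask |= 1 << f;  /* rarely useful filter: skip it in the search */
  }
  return mask;
}

int main(void) {
  const int sel[N_REFS][N_FILTERS] = {
    {90, 10, 0},  /* LAST never chose EIGHTTAP_SHARP */
    {50, 0, 0},
    {40, 0, 0},
  };
  return filter_search_mask(sel) == (1 << EIGHTTAP_SHARP) ? 0 : 1;
}
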
diff --git a/source/libvpx/vp9/encoder/vp9_encoder.h b/source/libvpx/vp9/encoder/vp9_encoder.h
index 82be0f4..0d3c4c1 100644
--- a/source/libvpx/vp9/encoder/vp9_encoder.h
+++ b/source/libvpx/vp9/encoder/vp9_encoder.h
@@ -114,9 +114,10 @@ typedef enum {
typedef struct VP9EncoderConfig {
BITSTREAM_PROFILE profile;
- BIT_DEPTH bit_depth;
+ vpx_bit_depth_t bit_depth; // Codec bit-depth.
int width; // width of data passed to the compressor
int height; // height of data passed to the compressor
+ unsigned int input_bit_depth; // Input bit depth.
double init_framerate; // set to passed in framerate
int64_t target_bandwidth; // bandwidth to be used in kilobits per second
@@ -203,16 +204,15 @@ typedef struct VP9EncoderConfig {
int arnr_max_frames;
int arnr_strength;
- int arnr_type;
int tile_columns;
int tile_rows;
- struct vpx_fixed_buf two_pass_stats_in;
- struct vpx_codec_pkt_list *output_pkt_list;
+ vpx_fixed_buf_t two_pass_stats_in;
+ struct vpx_codec_pkt_list *output_pkt_list;
#if CONFIG_FP_MB_STATS
- struct vpx_fixed_buf firstpass_mb_stats_in;
+ vpx_fixed_buf_t firstpass_mb_stats_in;
#endif
vp8e_tuning tuning;
@@ -223,19 +223,13 @@ static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
}
-static INLINE int is_best_mode(MODE mode) {
- return mode == BEST;
-}
-
typedef struct VP9_COMP {
QUANTS quants;
MACROBLOCK mb;
VP9_COMMON common;
VP9EncoderConfig oxcf;
struct lookahead_ctx *lookahead;
- struct lookahead_entry *source;
struct lookahead_entry *alt_ref_source;
- struct lookahead_entry *last_source;
YV12_BUFFER_CONFIG *Source;
YV12_BUFFER_CONFIG *Last_Source; // NULL for first frame and alt_ref frames
@@ -275,6 +269,11 @@ typedef struct VP9_COMP {
CODING_CONTEXT coding_context;
+ int *nmvcosts[2];
+ int *nmvcosts_hp[2];
+ int *nmvsadcosts[2];
+ int *nmvsadcosts_hp[2];
+
int zbin_mode_boost;
int zbin_mode_boost_enabled;
@@ -286,6 +285,7 @@ typedef struct VP9_COMP {
double framerate;
vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
+ int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE];
struct vpx_codec_pkt_list *output_pkt_list;
@@ -332,7 +332,7 @@ typedef struct VP9_COMP {
TWO_PASS twopass;
YV12_BUFFER_CONFIG alt_ref_buffer;
- YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+
#if CONFIG_INTERNAL_STATS
unsigned int mode_chosen_counts[MAX_MODES];
@@ -371,10 +371,6 @@ typedef struct VP9_COMP {
int droppable;
- int dummy_packing; /* flag to indicate if packing is dummy */
-
- unsigned int tx_stepdown_count[TX_SIZES];
-
int initial_width;
int initial_height;
@@ -393,7 +389,7 @@ typedef struct VP9_COMP {
search_site_config ss_cfg;
int mbmode_cost[INTRA_MODES];
- unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
+ unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES];
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
@@ -499,16 +495,17 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
-static INLINE int is_spatial_svc(const struct VP9_COMP *const cpi) {
+static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) {
return cpi->use_svc &&
- cpi->svc.number_temporal_layers == 1 &&
- cpi->svc.number_spatial_layers > 1;
+ (cpi->svc.number_temporal_layers > 1 ||
+ cpi->svc.number_spatial_layers > 1) &&
+ (cpi->oxcf.pass == 1 || cpi->oxcf.pass == 2);
}
static INLINE int is_altref_enabled(const VP9_COMP *const cpi) {
return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 &&
(cpi->oxcf.play_alternate &&
- (!is_spatial_svc(cpi) ||
+ (!is_two_pass_svc(cpi) ||
cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]));
}
@@ -525,6 +522,10 @@ static INLINE int get_chessboard_index(const int frame_index) {
return frame_index & 0x1;
}
+static INLINE int *cond_sad_list(const struct VP9_COMP *cpi, int *sad_list) {
+ return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? sad_list : NULL;
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
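
The new int *nmvcosts[2] members (and the _hp/sad variants) replace arrays formerly embedded in MACROBLOCK; vp9_create_compressor() now heap-allocates MV_VALS entries each and points cpi->mb.nmvcost[*] at the MV_MAX-th element, so signed MV components index the tables directly. A toy sketch of that centered-pointer layout, with MV_MAX shrunk for illustration:

#include <stdio.h>
#include <stdlib.h>

#define MV_MAX 4  /* tiny stand-in; libvpx uses a much larger range */
#define MV_VALS (MV_MAX * 2 + 1)

int main(void) {
  /* Heap-allocate the cost table, as the encoder constructor now does... */
  int *costs = calloc(MV_VALS, sizeof(*costs));
  if (costs == NULL) return 1;
  /* ...then publish a pointer centered on MV_MAX, so callers can index it
   * directly with signed MV components in [-MV_MAX, MV_MAX]. */
  int *nmvcost = &costs[MV_MAX];
  for (int v = -MV_MAX; v <= MV_MAX; ++v)
    nmvcost[v] = v < 0 ? -v : v;  /* hypothetical cost: |v| */
  printf("cost(-3) = %d\n", nmvcost[-3]);
  free(costs);
  return 0;
}
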
diff --git a/source/libvpx/vp9/encoder/vp9_firstpass.c b/source/libvpx/vp9/encoder/vp9_firstpass.c
index 94bbe9c..8041b59 100644
--- a/source/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/source/libvpx/vp9/encoder/vp9_firstpass.c
@@ -76,16 +76,6 @@ static void reset_fpf_position(TWO_PASS *p,
p->stats_in = position;
}
-static int lookup_next_frame_stats(const TWO_PASS *p,
- FIRSTPASS_STATS *next_frame) {
- if (p->stats_in >= p->stats_in_end)
- return EOF;
-
- *next_frame = *p->stats_in;
- return 1;
-}
-
-
// Read frame stats at an offset from the current position.
static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) ||
@@ -256,7 +246,7 @@ void vp9_init_first_pass(VP9_COMP *cpi) {
}
void vp9_end_first_pass(VP9_COMP *cpi) {
- if (is_spatial_svc(cpi)) {
+ if (is_two_pass_svc(cpi)) {
int i;
for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
output_stats(&cpi->svc.layer_context[i].twopass.total_stats,
@@ -396,7 +386,7 @@ static void set_first_pass_params(VP9_COMP *cpi) {
cpi->rc.frames_to_key = INT_MAX;
}
-void vp9_first_pass(VP9_COMP *cpi) {
+void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
int mb_row, mb_col;
MACROBLOCK *const x = &cpi->mb;
VP9_COMMON *const cm = &cpi->common;
@@ -428,12 +418,12 @@ void vp9_first_pass(VP9_COMP *cpi) {
int neutral_count = 0;
int new_mv_count = 0;
int sum_in_vectors = 0;
- uint32_t lastmv_as_int = 0;
+ MV lastmv = {0, 0};
TWO_PASS *twopass = &cpi->twopass;
const MV zero_mv = {0, 0};
const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
- LAYER_CONTEXT *const lc = is_spatial_svc(cpi) ?
- &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0;
+ LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
+ &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL;
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
@@ -448,13 +438,13 @@ void vp9_first_pass(VP9_COMP *cpi) {
if (lc != NULL) {
MV_REFERENCE_FRAME ref_frame = LAST_FRAME;
- const YV12_BUFFER_CONFIG *scaled_ref_buf = NULL;
twopass = &lc->twopass;
if (cpi->common.current_video_frame == 0) {
cpi->ref_frame_flags = 0;
} else {
- if (lc->current_video_frame_in_layer == 0)
+ if (lc->current_video_frame_in_layer <
+ (unsigned int)cpi->svc.number_temporal_layers)
cpi->ref_frame_flags = VP9_GOLD_FLAG;
else
cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
@@ -464,16 +454,17 @@ void vp9_first_pass(VP9_COMP *cpi) {
// Use either last frame or alt frame for motion search.
if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
- scaled_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
+ first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
ref_frame = LAST_FRAME;
+ if (first_ref_buf == NULL)
+ first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME);
} else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
- scaled_ref_buf = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+ first_ref_buf = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
ref_frame = GOLDEN_FRAME;
+ if (first_ref_buf == NULL)
+ first_ref_buf = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
}
- if (scaled_ref_buf != NULL)
- first_ref_buf = scaled_ref_buf;
-
recon_y_stride = new_yv12->y_stride;
recon_uv_stride = new_yv12->uv_stride;
uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
@@ -512,9 +503,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
vp9_tile_init(&tile, cm, 0, 0);
for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
- int_mv best_ref_mv;
-
- best_ref_mv.as_int = 0;
+ MV best_ref_mv = {0, 0};
// Reset above block coeffs.
xd->up_available = (mb_row != 0);
@@ -594,14 +583,13 @@ void vp9_first_pass(VP9_COMP *cpi) {
// Other than for the first frame do a motion search.
if (cm->current_video_frame > 0) {
int tmp_err, motion_error, raw_motion_error;
- int_mv mv, tmp_mv;
+ // Assume 0,0 motion with no mv overhead.
+ MV mv = {0, 0}, tmp_mv = {0, 0};
struct buf_2d unscaled_last_source_buf_2d;
xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
motion_error = get_prediction_error(bsize, &x->plane[0].src,
&xd->plane[0].pre[0]);
- // Assume 0,0 motion with no mv overhead.
- mv.as_int = tmp_mv.as_int = 0;
// Compute the motion error of the 0,0 motion using the last source
// frame as the reference. Skip the further motion search on
@@ -617,8 +605,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
if (raw_motion_error > 25 || lc != NULL) {
// Test last reference frame using the previous best mv as the
// starting point (best reference) for the search.
- first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
- &motion_error);
+ first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state();
motion_error = (int)(motion_error * error_weight);
@@ -626,9 +613,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
// If the current best reference mv is not centered on 0,0 then do a
// 0,0 based search as well.
- if (best_ref_mv.as_int) {
+ if (!is_zero_mv(&best_ref_mv)) {
tmp_err = INT_MAX;
- first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, &tmp_err);
+ first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state();
tmp_err = (int)(tmp_err * error_weight);
@@ -636,7 +623,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
if (tmp_err < motion_error) {
motion_error = tmp_err;
- mv.as_int = tmp_mv.as_int;
+ mv = tmp_mv;
}
}
@@ -649,7 +636,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
&xd->plane[0].pre[0]);
- first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
+ first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
&gf_motion_error);
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state();
@@ -680,7 +667,8 @@ void vp9_first_pass(VP9_COMP *cpi) {
}
// Start by assuming that intra mode is best.
- best_ref_mv.as_int = 0;
+ best_ref_mv.row = 0;
+ best_ref_mv.col = 0;
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
@@ -704,25 +692,25 @@ void vp9_first_pass(VP9_COMP *cpi) {
this_error < 2 * intrapenalty)
++neutral_count;
- mv.as_mv.row *= 8;
- mv.as_mv.col *= 8;
+ mv.row *= 8;
+ mv.col *= 8;
this_error = motion_error;
xd->mi[0]->mbmi.mode = NEWMV;
- xd->mi[0]->mbmi.mv[0] = mv;
+ xd->mi[0]->mbmi.mv[0].as_mv = mv;
xd->mi[0]->mbmi.tx_size = TX_4X4;
xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
xd->mi[0]->mbmi.ref_frame[1] = NONE;
vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
vp9_encode_sby_pass1(x, bsize);
- sum_mvr += mv.as_mv.row;
- sum_mvr_abs += abs(mv.as_mv.row);
- sum_mvc += mv.as_mv.col;
- sum_mvc_abs += abs(mv.as_mv.col);
- sum_mvrs += mv.as_mv.row * mv.as_mv.row;
- sum_mvcs += mv.as_mv.col * mv.as_mv.col;
+ sum_mvr += mv.row;
+ sum_mvr_abs += abs(mv.row);
+ sum_mvc += mv.col;
+ sum_mvc_abs += abs(mv.col);
+ sum_mvrs += mv.row * mv.row;
+ sum_mvcs += mv.col * mv.col;
++intercount;
- best_ref_mv.as_int = mv.as_int;
+ best_ref_mv = mv;
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
@@ -740,7 +728,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
}
#endif
- if (mv.as_int) {
+ if (!is_zero_mv(&mv)) {
++mvcount;
#if CONFIG_FP_MB_STATS
@@ -771,33 +759,33 @@ void vp9_first_pass(VP9_COMP *cpi) {
#endif
// Non-zero vector, was it different from the last non-zero vector?
- if (mv.as_int != lastmv_as_int)
+ if (!is_equal_mv(&mv, &lastmv))
++new_mv_count;
- lastmv_as_int = mv.as_int;
+ lastmv = mv;
// Does the row vector point inwards or outwards?
if (mb_row < cm->mb_rows / 2) {
- if (mv.as_mv.row > 0)
+ if (mv.row > 0)
--sum_in_vectors;
- else if (mv.as_mv.row < 0)
+ else if (mv.row < 0)
++sum_in_vectors;
} else if (mb_row > cm->mb_rows / 2) {
- if (mv.as_mv.row > 0)
+ if (mv.row > 0)
++sum_in_vectors;
- else if (mv.as_mv.row < 0)
+ else if (mv.row < 0)
--sum_in_vectors;
}
// Does the col vector point inwards or outwards?
if (mb_col < cm->mb_cols / 2) {
- if (mv.as_mv.col > 0)
+ if (mv.col > 0)
--sum_in_vectors;
- else if (mv.as_mv.col < 0)
+ else if (mv.col < 0)
++sum_in_vectors;
} else if (mb_col > cm->mb_cols / 2) {
- if (mv.as_mv.col > 0)
+ if (mv.col > 0)
++sum_in_vectors;
- else if (mv.as_mv.col < 0)
+ else if (mv.col < 0)
--sum_in_vectors;
}
}
@@ -865,7 +853,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
// TODO(paulwilkins): Handle the case when duration is set to 0, or
// something less than the full time between subsequent values of
// cpi->source_time_stamp.
- fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start);
+ fps.duration = (double)(source->ts_end - source->ts_start);
// Don't want to do output stats with a stack variable!
twopass->this_frame_stats = fps;
@@ -927,7 +915,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
++cm->current_video_frame;
if (cpi->use_svc)
- vp9_inc_frame_in_layer(&cpi->svc);
+ vp9_inc_frame_in_layer(cpi);
}
static double calc_correction_factor(double err_per_mb,
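
The first-pass hunks above replace the packed int_mv/as_int idiom with plain MV structs and the is_zero_mv()/is_equal_mv() helpers. A minimal sketch of the two representations and field-wise equivalents of those helpers; assumption: the library's versions may compare the packed 32-bit value rather than the fields, but the result is the same:

    #include <assert.h>
    #include <stdint.h>

    typedef struct { int16_t row, col; } MV;              /* as in vp9 */
    typedef union { uint32_t as_int; MV as_mv; } int_mv;  /* the old type */

    /* Field-wise equivalents of the helpers the first pass now uses in
     * place of comparing int_mv.as_int directly. */
    static int is_zero_mv(const MV *mv) { return mv->row == 0 && mv->col == 0; }
    static int is_equal_mv(const MV *a, const MV *b) {
      return a->row == b->row && a->col == b->col;
    }

    int main(void) {
      int_mv old = { 0 };                  /* old style: if (!old.as_int) ... */
      MV lastmv = { 0, 0 }, mv = { 4, -8 };
      assert(is_zero_mv(&old.as_mv));
      assert(!is_equal_mv(&mv, &lastmv));  /* replaces mv.as_int != lastmv_as_int */
      return 0;
    }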
@@ -965,7 +953,7 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
BPER_MB_NORMBITS) / num_mbs;
int q;
int is_svc_upper_layer = 0;
- if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0)
+ if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0)
is_svc_upper_layer = 1;
// Try and pick a max Q that will be high enough to encode the
@@ -993,9 +981,9 @@ extern void vp9_new_framerate(VP9_COMP *cpi, double framerate);
void vp9_init_second_pass(VP9_COMP *cpi) {
SVC *const svc = &cpi->svc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
- const int is_spatial_svc = (svc->number_spatial_layers > 1) &&
- (svc->number_temporal_layers == 1);
- TWO_PASS *const twopass = is_spatial_svc ?
+ const int is_two_pass_svc = (svc->number_spatial_layers > 1) ||
+ (svc->number_temporal_layers > 1);
+ TWO_PASS *const twopass = is_two_pass_svc ?
&svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
double frame_rate;
FIRSTPASS_STATS *stats;
@@ -1018,7 +1006,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
// It is calculated based on the actual durations of all frames from the
// first pass.
- if (is_spatial_svc) {
+ if (is_two_pass_svc) {
vp9_update_spatial_layer_framerate(cpi, frame_rate);
twopass->bits_left = (int64_t)(stats->duration *
svc->layer_context[svc->spatial_layer_id].target_bandwidth /
@@ -1033,7 +1021,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
// scores used in the second pass. We have this minimum to make sure
// that clips that are static but "low complexity" in the intra domain
// are still boosted appropriately for KF/GF/ARF.
- if (!is_spatial_svc) {
+ if (!is_two_pass_svc) {
// We don't know the number of MBs for each layer at this point.
// So we will do it later.
twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
@@ -1381,6 +1369,13 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
int mid_boost_bits = 0;
int mid_frame_idx;
unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
+ int alt_frame_index = frame_index;
+ int has_temporal_layers = is_two_pass_svc(cpi) &&
+ cpi->svc.number_temporal_layers > 1;
+
+ // Only encode the alt reference frame in the temporal base layer.
+ if (has_temporal_layers)
+ alt_frame_index = cpi->svc.number_temporal_layers;
key_frame = cpi->common.frame_type == KEY_FRAME ||
vp9_is_upper_layer_key_frame(cpi);
@@ -1416,16 +1411,24 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
// Store the bits to spend on the ARF if there is one.
if (rc->source_alt_ref_pending) {
- gf_group->update_type[frame_index] = ARF_UPDATE;
- gf_group->rf_level[frame_index] = GF_ARF_STD;
- gf_group->bit_allocation[frame_index] = gf_arf_bits;
- gf_group->arf_src_offset[frame_index] =
- (unsigned char)(rc->baseline_gf_interval - 1);
- gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
- gf_group->arf_ref_idx[frame_index] =
+ gf_group->update_type[alt_frame_index] = ARF_UPDATE;
+ gf_group->rf_level[alt_frame_index] = GF_ARF_STD;
+ gf_group->bit_allocation[alt_frame_index] = gf_arf_bits;
+
+ if (has_temporal_layers)
+ gf_group->arf_src_offset[alt_frame_index] =
+ (unsigned char)(rc->baseline_gf_interval -
+ cpi->svc.number_temporal_layers);
+ else
+ gf_group->arf_src_offset[alt_frame_index] =
+ (unsigned char)(rc->baseline_gf_interval - 1);
+
+ gf_group->arf_update_idx[alt_frame_index] = arf_buffer_indices[0];
+ gf_group->arf_ref_idx[alt_frame_index] =
arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
rc->source_alt_ref_active];
- ++frame_index;
+ if (!has_temporal_layers)
+ ++frame_index;
if (cpi->multi_arf_enabled) {
// Set aside a slot for a level 1 arf.
@@ -1448,6 +1451,10 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
if (EOF == input_stats(twopass, &frame_stats))
break;
+ if (has_temporal_layers && frame_index == alt_frame_index) {
+ ++frame_index;
+ }
+
modified_err = calculate_modified_err(twopass, oxcf, &frame_stats);
if (group_error > 0)
@@ -1669,6 +1676,21 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
else
rc->baseline_gf_interval = i;
+ // Only encode the alt reference frame in the temporal base layer, so
+ // baseline_gf_interval should be a multiple of the temporal layer group
+ // size (typically the frame distance between two base layer frames).
+ if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) {
+ int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1;
+ int new_gf_interval = (rc->baseline_gf_interval + count) & (~count);
+ int j;
+ for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) {
+ if (EOF == input_stats(twopass, this_frame))
+ break;
+ gf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
+ }
+ rc->baseline_gf_interval = new_gf_interval;
+ }
+
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
// Should we use the alternate reference frame.
@@ -1874,16 +1896,17 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
input_stats(twopass, this_frame);
// Provided that we are not at the end of the file...
- if (cpi->oxcf.auto_key &&
- lookup_next_frame_stats(twopass, &next_frame) != EOF) {
+ if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
double loop_decay_rate;
// Check for a scene cut.
- if (test_candidate_kf(twopass, &last_frame, this_frame, &next_frame))
+ if (test_candidate_kf(twopass, &last_frame, this_frame,
+ twopass->stats_in))
break;
// How fast is the prediction quality decaying?
- loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
+ loop_decay_rate = get_prediction_decay_rate(&cpi->common,
+ twopass->stats_in);
// We want to know something about the recent past... rather than
// as used elsewhere where we are concerned with decay in prediction
@@ -1940,6 +1963,18 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
rc->next_key_frame_forced = 0;
}
+ if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) {
+ int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1;
+ int new_frame_to_key = (rc->frames_to_key + count) & (~count);
+ int j;
+ for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) {
+ if (EOF == input_stats(twopass, this_frame))
+ break;
+ kf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
+ }
+ rc->frames_to_key = new_frame_to_key;
+ }
+
// Special case for the last key frame of the file.
if (twopass->stats_in >= twopass->stats_in_end) {
// Accumulate kf group error.
@@ -2098,7 +2133,7 @@ void configure_buffer_updates(VP9_COMP *cpi) {
assert(0);
break;
}
- if (is_spatial_svc(cpi)) {
+ if (is_two_pass_svc(cpi)) {
if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0)
cpi->refresh_golden_frame = 0;
if (cpi->alt_ref_source == NULL)
@@ -2117,7 +2152,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
FIRSTPASS_STATS this_frame_copy;
int target_rate;
- LAYER_CONTEXT *const lc = is_spatial_svc(cpi) ?
+ LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
&cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0;
if (lc != NULL) {
@@ -2200,15 +2235,18 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
if (lc != NULL) {
if (cpi->svc.spatial_layer_id == 0) {
lc->is_key_frame = (cm->frame_type == KEY_FRAME);
- if (lc->is_key_frame)
+ if (lc->is_key_frame) {
cpi->ref_frame_flags &=
(~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
+ lc->frames_from_key_frame = 0;
+ }
} else {
cm->frame_type = INTER_FRAME;
lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
if (lc->is_key_frame) {
cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
+ lc->frames_from_key_frame = 0;
}
}
}
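
Both new alignment blocks above (in define_gf_group() and find_next_key_frame()) round an interval up to a multiple of the temporal-layer group size, 2^(number_temporal_layers - 1), with the bit trick (x + count) & ~count, and then pull first-pass stats for the added frames so the accumulated group error covers them. A worked sketch of just the rounding; align_to_layer_group is a hypothetical name:

    #include <assert.h>

    /* Round interval up to the next multiple of the temporal layer group
     * size, 2^(number_temporal_layers - 1), using the same bit trick as
     * the patch: (x + count) & ~count with count = group_size - 1. */
    static int align_to_layer_group(int interval, int number_temporal_layers) {
      const int count = (1 << (number_temporal_layers - 1)) - 1;
      return (interval + count) & (~count);
    }

    int main(void) {
      /* Three temporal layers -> group size 4. */
      assert(align_to_layer_group(13, 3) == 16);
      assert(align_to_layer_group(16, 3) == 16);
      /* One temporal layer -> count == 0, interval unchanged. */
      assert(align_to_layer_group(13, 1) == 13);
      return 0;
    }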
diff --git a/source/libvpx/vp9/encoder/vp9_firstpass.h b/source/libvpx/vp9/encoder/vp9_firstpass.h
index bf8c9fd..aaa6b03 100644
--- a/source/libvpx/vp9/encoder/vp9_firstpass.h
+++ b/source/libvpx/vp9/encoder/vp9_firstpass.h
@@ -121,7 +121,7 @@ struct VP9_COMP;
void vp9_init_first_pass(struct VP9_COMP *cpi);
void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi);
-void vp9_first_pass(struct VP9_COMP *cpi);
+void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source);
void vp9_end_first_pass(struct VP9_COMP *cpi);
void vp9_init_second_pass(struct VP9_COMP *cpi);
diff --git a/source/libvpx/vp9/encoder/vp9_lookahead.c b/source/libvpx/vp9/encoder/vp9_lookahead.c
index e743517..823e7a1 100644
--- a/source/libvpx/vp9/encoder/vp9_lookahead.c
+++ b/source/libvpx/vp9/encoder/vp9_lookahead.c
@@ -50,6 +50,9 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
unsigned int height,
unsigned int subsampling_x,
unsigned int subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
unsigned int depth) {
struct lookahead_ctx *ctx = NULL;
@@ -70,6 +73,9 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
for (i = 0; i < depth; i++)
if (vp9_alloc_frame_buffer(&ctx->buf[i].img,
width, height, subsampling_x, subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS))
goto bail;
}
diff --git a/source/libvpx/vp9/encoder/vp9_lookahead.h b/source/libvpx/vp9/encoder/vp9_lookahead.h
index 678c51a..2786193 100644
--- a/source/libvpx/vp9/encoder/vp9_lookahead.h
+++ b/source/libvpx/vp9/encoder/vp9_lookahead.h
@@ -56,6 +56,9 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
unsigned int height,
unsigned int subsampling_x,
unsigned int subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
unsigned int depth);
diff --git a/source/libvpx/vp9/encoder/vp9_mbgraph.c b/source/libvpx/vp9/encoder/vp9_mbgraph.c
index 6e04e2a..b8e7164 100644
--- a/source/libvpx/vp9/encoder/vp9_mbgraph.c
+++ b/source/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -34,6 +34,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
const int tmp_row_min = x->mv_row_min;
const int tmp_row_max = x->mv_row_max;
MV ref_full;
+ int sad_list[5];
// Further step/diamond searches as necessary
int step_param = mv_sf->reduce_first_step_size;
@@ -45,8 +46,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
ref_full.row = ref_mv->row >> 3;
/*cpi->sf.search_method == HEX*/
- vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0, &v_fn_ptr, 0,
- ref_mv, dst_mv);
+ vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
+ cond_sad_list(cpi, sad_list),
+ &v_fn_ptr, 0, ref_mv, dst_mv);
// Try sub-pixel MC
// if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -55,8 +57,10 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
unsigned int sse;
cpi->find_fractional_mv_step(
x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
- &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, NULL, NULL, &distortion,
- &sse, NULL, 0, 0);
+ &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
+ cond_sad_list(cpi, sad_list),
+ NULL, NULL,
+ &distortion, &sse, NULL, 0, 0);
}
xd->mi[0]->mbmi.mode = NEWMV;
diff --git a/source/libvpx/vp9/encoder/vp9_mcomp.c b/source/libvpx/vp9/encoder/vp9_mcomp.c
index ae924d5..d6f6b25 100644
--- a/source/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/source/libvpx/vp9/encoder/vp9_mcomp.c
@@ -256,6 +256,137 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
} \
}
+#define SETUP_SUBPEL_SEARCH \
+ const uint8_t *const z = x->plane[0].src.buf; \
+ const int src_stride = x->plane[0].src.stride; \
+ const MACROBLOCKD *xd = &x->e_mbd; \
+ unsigned int besterr = INT_MAX; \
+ unsigned int sse; \
+ unsigned int whichdir; \
+ int thismse; \
+ const unsigned int halfiters = iters_per_step; \
+ const unsigned int quarteriters = iters_per_step; \
+ const unsigned int eighthiters = iters_per_step; \
+ const int y_stride = xd->plane[0].pre[0].stride; \
+ const int offset = bestmv->row * y_stride + bestmv->col; \
+ const uint8_t *const y = xd->plane[0].pre[0].buf; \
+ \
+ int rr = ref_mv->row; \
+ int rc = ref_mv->col; \
+ int br = bestmv->row * 8; \
+ int bc = bestmv->col * 8; \
+ int hstep = 4; \
+ const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); \
+ const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); \
+ const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); \
+ const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); \
+ int tr = br; \
+ int tc = bc; \
+ \
+ bestmv->row *= 8; \
+ bestmv->col *= 8; \
+ if (second_pred != NULL) { \
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); \
+ vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
+ besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); \
+ } else { \
+ besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); \
+ } \
+ *distortion = besterr; \
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ int forced_stop,
+ int iters_per_step,
+ int *sad_list,
+ int *mvjcost, int *mvcost[2],
+ int *distortion,
+ unsigned int *sse1,
+ const uint8_t *second_pred,
+ int w, int h) {
+ SETUP_SUBPEL_SEARCH;
+
+ if (sad_list &&
+ sad_list[0] != INT_MAX && sad_list[1] != INT_MAX &&
+ sad_list[2] != INT_MAX && sad_list[3] != INT_MAX &&
+ sad_list[4] != INT_MAX) {
+ unsigned int left, right, up, down, diag;
+ whichdir = (sad_list[1] < sad_list[3] ? 0 : 1) +
+ (sad_list[2] < sad_list[4] ? 0 : 2);
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ // Each subsequent iteration checks at least one point in common with the
+ // last iteration (could be two if the diagonal was selected). Next, 1/4 pel.
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+
+ if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+ // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void) tr;
+ (void) tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+ return INT_MAX;
+
+ return besterr;
+}
+
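
The pruned search above uses the four one-away SADs to pick a single quadrant before probing: comparing left against right and top against bottom selects which diagonal neighborhood to check, cutting the first-level probes down to three points. A standalone illustration of the whichdir computation; pick_quadrant is a hypothetical wrapper:

    #include <stdio.h>

    /* sad_list layout from the full-pixel search:
     * [0] best, [1] left, [2] top, [3] right, [4] bottom. */
    static const char *pick_quadrant(const int sad_list[5]) {
      const int whichdir = (sad_list[1] < sad_list[3] ? 0 : 1) +
                           (sad_list[2] < sad_list[4] ? 0 : 2);
      switch (whichdir) {
        case 0: return "up-left";      /* left and top are cheaper     */
        case 1: return "up-right";     /* right and top are cheaper    */
        case 2: return "down-left";    /* left and bottom are cheaper  */
        default: return "down-right";  /* right and bottom are cheaper */
      }
    }

    int main(void) {
      const int sads[5] = { 100, 90, 120, 110, 95 };
      /* left < right but bottom < top, so probe the down-left diagonal. */
      printf("%s\n", pick_quadrant(sads));
      return 0;
    }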
int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
MV *bestmv, const MV *ref_mv,
int allow_hp,
@@ -263,55 +394,14 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
int iters_per_step,
+ int *sad_list,
int *mvjcost, int *mvcost[2],
int *distortion,
unsigned int *sse1,
const uint8_t *second_pred,
int w, int h) {
- const uint8_t *const z = x->plane[0].src.buf;
- const int src_stride = x->plane[0].src.stride;
- const MACROBLOCKD *xd = &x->e_mbd;
- unsigned int besterr = INT_MAX;
- unsigned int sse;
- unsigned int whichdir;
- int thismse;
- const unsigned int halfiters = iters_per_step;
- const unsigned int quarteriters = iters_per_step;
- const unsigned int eighthiters = iters_per_step;
-
- const int y_stride = xd->plane[0].pre[0].stride;
- const int offset = bestmv->row * y_stride + bestmv->col;
- const uint8_t *const y = xd->plane[0].pre[0].buf;
-
- int rr = ref_mv->row;
- int rc = ref_mv->col;
- int br = bestmv->row * 8;
- int bc = bestmv->col * 8;
- int hstep = 4;
- const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
- const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
- const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
- const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
-
- int tr = br;
- int tc = bc;
-
- // central mv
- bestmv->row *= 8;
- bestmv->col *= 8;
-
- // calculate central point error
- // TODO(yunqingwang): central pointer error was already calculated in full-
- // pixel search, and can be passed in this function.
- if (second_pred != NULL) {
- DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
- vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
- besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
- } else {
- besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);
- }
- *distortion = besterr;
- besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ SETUP_SUBPEL_SEARCH;
+ (void) sad_list; // to silence compiler warning
// Each subsequent iteration checks at least one point in
// common with the last iteration (could be 2 if diag selected)
@@ -398,14 +488,17 @@ static INLINE int is_mv_in(const MACROBLOCK *x, const MV *mv) {
// Each scale can have a different number of candidates and shape of
// candidates as indicated in the num_candidates and candidates arrays
// passed into this function
+//
static int vp9_pattern_search(const MACROBLOCK *x,
MV *ref_mv,
int search_param,
int sad_per_bit,
- int do_init_search, int do_refine,
+ int do_init_search,
+ int *sad_list,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
- const MV *center_mv, MV *best_mv,
+ const MV *center_mv,
+ MV *best_mv,
const int num_candidates[MAX_PATTERN_SCALES],
const MV candidates[MAX_PATTERN_SCALES]
[MAX_PATTERN_CANDIDATES]) {
@@ -413,7 +506,7 @@ static int vp9_pattern_search(const MACROBLOCK *x,
static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
};
- int i, j, s, t;
+ int i, s, t;
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[0];
int br, bc;
@@ -552,47 +645,38 @@ static int vp9_pattern_search(const MACROBLOCK *x,
} while (s--);
}
- // Check 4 1-away neighbors if do_refine is true.
- // For most well-designed schemes do_refine will not be necessary.
- if (do_refine) {
- static const MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
-
- for (j = 0; j < 16; j++) {
- int best_site = -1;
- if (check_bounds(x, br, bc, 1)) {
- for (i = 0; i < 4; i++) {
- const MV this_mv = {br + neighbors[i].row,
- bc + neighbors[i].col};
- thissad = vfp->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, &this_mv),
- in_what->stride);
- CHECK_BETTER
- }
- } else {
- for (i = 0; i < 4; i++) {
- const MV this_mv = {br + neighbors[i].row,
- bc + neighbors[i].col};
- if (!is_mv_in(x, &this_mv))
- continue;
- thissad = vfp->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, &this_mv),
- in_what->stride);
- CHECK_BETTER
- }
+ // Returns the one-away integer pel sad values around the best as follows:
+ // sad_list[0]: sad at the best integer pel
+ // sad_list[1]: sad at delta {0, -1} (left) from the best integer pel
+ // sad_list[2]: sad at delta {-1, 0} (top) from the best integer pel
+ // sad_list[3]: sad at delta { 0, 1} (right) from the best integer pel
+ // sad_list[4]: sad at delta { 1, 0} (bottom) from the best integer pel
+ if (sad_list) {
+ static const MV neighbors[4] = {{0, -1}, {-1, 0}, {0, 1}, {1, 0}};
+ sad_list[0] = bestsad;
+ if (check_bounds(x, br, bc, 1)) {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = {br + neighbors[i].row,
+ bc + neighbors[i].col};
+ sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride);
}
-
- if (best_site == -1) {
- break;
- } else {
- br += neighbors[best_site].row;
- bc += neighbors[best_site].col;
+ } else {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = {br + neighbors[i].row,
+ bc + neighbors[i].col};
+ if (!is_mv_in(x, &this_mv))
+ sad_list[i + 1] = INT_MAX;
+ else
+ sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride);
}
}
}
-
best_mv->row = br;
best_mv->col = bc;
-
return bestsad;
}
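
The tail of vp9_pattern_search() above replaces the old do_refine loop with a simple recording step: the best SAD plus its left/top/right/bottom neighbors, in that order, ready for the pruned subpel search. A compilable sketch of that layout with a toy SAD surface; fake_sad and fill_sad_list are illustrative, and the real code writes INT_MAX for neighbors outside the search range, a bounds check omitted here:

    #include <stdio.h>

    typedef struct { int row, col; } MV;

    /* Toy SAD surface: distance from a pretend optimum at (2, 3). */
    static int fake_sad(const MV *mv) {
      const int dr = mv->row - 2, dc = mv->col - 3;
      return dr * dr + dc * dc;
    }

    /* Mirrors the recording step: best SAD plus its four 1-away
     * neighbors in left/top/right/bottom order. */
    static void fill_sad_list(int br, int bc, int sad_list[5]) {
      static const MV neighbors[4] = { {0, -1}, {-1, 0}, {0, 1}, {1, 0} };
      const MV best = { br, bc };
      int i;
      sad_list[0] = fake_sad(&best);
      for (i = 0; i < 4; i++) {
        const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
        sad_list[i + 1] = fake_sad(&this_mv);
      }
    }

    int main(void) {
      int sad_list[5];
      fill_sad_list(2, 2, sad_list);  /* best integer pel at (2, 2) */
      printf("%d %d %d %d %d\n", sad_list[0], sad_list[1], sad_list[2],
             sad_list[3], sad_list[4]);  /* prints: 1 4 2 0 2 */
      return 0;
    }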
@@ -634,6 +718,7 @@ int vp9_hex_search(const MACROBLOCK *x,
int search_param,
int sad_per_bit,
int do_init_search,
+ int *sad_list,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
const MV *center_mv, MV *best_mv) {
@@ -658,7 +743,7 @@ int vp9_hex_search(const MACROBLOCK *x,
{ -1024, 0}},
};
return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
- do_init_search, 0, vfp, use_mvcost,
+ do_init_search, sad_list, vfp, use_mvcost,
center_mv, best_mv,
hex_num_candidates, hex_candidates);
}
@@ -668,6 +753,7 @@ int vp9_bigdia_search(const MACROBLOCK *x,
int search_param,
int sad_per_bit,
int do_init_search,
+ int *sad_list,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
const MV *center_mv,
@@ -699,7 +785,7 @@ int vp9_bigdia_search(const MACROBLOCK *x,
{-512, 512}, {-1024, 0}},
};
return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
- do_init_search, 0, vfp, use_mvcost,
+ do_init_search, sad_list, vfp, use_mvcost,
center_mv, best_mv,
bigdia_num_candidates, bigdia_candidates);
}
@@ -709,6 +795,7 @@ int vp9_square_search(const MACROBLOCK *x,
int search_param,
int sad_per_bit,
int do_init_search,
+ int *sad_list,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
const MV *center_mv,
@@ -740,7 +827,7 @@ int vp9_square_search(const MACROBLOCK *x,
{0, 1024}, {-1024, 1024}, {-1024, 0}},
};
return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
- do_init_search, 0, vfp, use_mvcost,
+ do_init_search, sad_list, vfp, use_mvcost,
center_mv, best_mv,
square_num_candidates, square_candidates);
}
@@ -750,12 +837,13 @@ int vp9_fast_hex_search(const MACROBLOCK *x,
int search_param,
int sad_per_bit,
int do_init_search, // must be zero for fast_hex
+ int *sad_list,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
const MV *center_mv,
MV *best_mv) {
return vp9_hex_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
- sad_per_bit, do_init_search, vfp, use_mvcost,
+ sad_per_bit, do_init_search, sad_list, vfp, use_mvcost,
center_mv, best_mv);
}
@@ -764,13 +852,14 @@ int vp9_fast_dia_search(const MACROBLOCK *x,
int search_param,
int sad_per_bit,
int do_init_search,
+ int *sad_list,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
const MV *center_mv,
MV *best_mv) {
return vp9_bigdia_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
- sad_per_bit, do_init_search, vfp, use_mvcost,
- center_mv, best_mv);
+ sad_per_bit, do_init_search, sad_list, vfp,
+ use_mvcost, center_mv, best_mv);
}
#undef CHECK_BETTER
@@ -1368,33 +1457,41 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x,
int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, MV *mvp_full,
int step_param, int error_per_bit,
+ int *sad_list,
const MV *ref_mv, MV *tmp_mv,
int var_max, int rd) {
const SPEED_FEATURES *const sf = &cpi->sf;
const SEARCH_METHODS method = sf->mv.search_method;
vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
int var = 0;
+ if (sad_list) {
+ sad_list[0] = INT_MAX;
+ sad_list[1] = INT_MAX;
+ sad_list[2] = INT_MAX;
+ sad_list[3] = INT_MAX;
+ sad_list[4] = INT_MAX;
+ }
switch (method) {
case FAST_DIAMOND:
var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
- fn_ptr, 1, ref_mv, tmp_mv);
+ sad_list, fn_ptr, 1, ref_mv, tmp_mv);
break;
case FAST_HEX:
var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
- fn_ptr, 1, ref_mv, tmp_mv);
+ sad_list, fn_ptr, 1, ref_mv, tmp_mv);
break;
case HEX:
var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1,
- fn_ptr, 1, ref_mv, tmp_mv);
+ sad_list, fn_ptr, 1, ref_mv, tmp_mv);
break;
case SQUARE:
var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1,
- fn_ptr, 1, ref_mv, tmp_mv);
+ sad_list, fn_ptr, 1, ref_mv, tmp_mv);
break;
case BIGDIA:
var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
- fn_ptr, 1, ref_mv, tmp_mv);
+ sad_list, fn_ptr, 1, ref_mv, tmp_mv);
break;
case NSTEP:
var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
diff --git a/source/libvpx/vp9/encoder/vp9_mcomp.h b/source/libvpx/vp9/encoder/vp9_mcomp.h
index 298fbb6..9b4734a 100644
--- a/source/libvpx/vp9/encoder/vp9_mcomp.h
+++ b/source/libvpx/vp9/encoder/vp9_mcomp.h
@@ -79,6 +79,7 @@ typedef int (integer_mv_pattern_search_fn) (
int search_param,
int error_per_bit,
int do_init_search,
+ int *sad_list,
const vp9_variance_fn_ptr_t *vf,
int use_mvcost,
const MV *center_mv,
@@ -98,12 +99,14 @@ typedef int (fractional_mv_step_fp) (
const vp9_variance_fn_ptr_t *vfp,
int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
int iters_per_step,
+ int *sad_list,
int *mvjcost, int *mvcost[2],
int *distortion, unsigned int *sse1,
const uint8_t *second_pred,
int w, int h);
extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
const MV *ref_mv, int sad_per_bit,
@@ -136,8 +139,10 @@ struct VP9_COMP;
int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, MV *mvp_full,
int step_param, int error_per_bit,
+ int *sad_list,
const MV *ref_mv, MV *tmp_mv,
int var_max, int rd);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/source/libvpx/vp9/encoder/vp9_pickmode.c b/source/libvpx/vp9/encoder/vp9_pickmode.c
index 5646f5b..eee6ffe 100644
--- a/source/libvpx/vp9/encoder/vp9_pickmode.c
+++ b/source/libvpx/vp9/encoder/vp9_pickmode.c
@@ -126,6 +126,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
const int tmp_row_min = x->mv_row_min;
const int tmp_row_max = x->mv_row_max;
int rv = 0;
+ int sad_list[5];
const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
ref);
if (cpi->common.show_frame &&
@@ -152,8 +153,9 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
mvp_full.col >>= 3;
mvp_full.row >>= 3;
- vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, &ref_mv,
- &tmp_mv->as_mv, INT_MAX, 0);
+ vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+ cond_sad_list(cpi, sad_list),
+ &ref_mv, &tmp_mv->as_mv, INT_MAX, 0);
x->mv_col_min = tmp_col_min;
x->mv_col_max = tmp_col_max;
@@ -179,6 +181,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
&cpi->fn_ptr[bsize],
cpi->sf.mv.subpel_force_stop,
cpi->sf.mv.subpel_iters_per_step,
+ cond_sad_list(cpi, sad_list),
x->nmvjointcost, x->mvcost,
&dis, &x->pred_sse[ref], NULL, 0, 0);
x->pred_mv[ref] = tmp_mv->as_mv;
@@ -391,7 +394,7 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
args->dist += dist;
}
-static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = {
+static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][INTER_MODES] = {
{THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV},
{THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG},
{THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA},
@@ -420,7 +423,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
VP9_ALT_FLAG };
int64_t best_rd = INT64_MAX;
int64_t this_rd = INT64_MAX;
- int skip_txfm = 0;
+ uint8_t skip_txfm = 0;
int rate = INT_MAX;
int64_t dist = INT64_MAX;
// var_y and sse_y are saved to be used in skip checking
@@ -544,7 +547,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
continue;
mode_rd_thresh = rd_threshes[mode_idx[ref_frame - LAST_FRAME]
- [this_mode - NEARESTMV]];
+ [INTER_OFFSET(this_mode)]];
if (rd_less_than_thresh(best_rd, mode_rd_thresh,
rd_thresh_freq_fact[this_mode]))
continue;
@@ -656,8 +659,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0) {
- vp9_denoiser_update_frame_stats(&cpi->denoiser, mbmi, sse_y,
- this_mode, ctx);
+ vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx);
}
#endif
diff --git a/source/libvpx/vp9/encoder/vp9_ratectrl.c b/source/libvpx/vp9/encoder/vp9_ratectrl.c
index b926a58..b607c85 100644
--- a/source/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/source/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -1235,7 +1235,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
cm->frame_type = KEY_FRAME;
rc->source_alt_ref_active = 0;
- if (is_spatial_svc(cpi)) {
+ if (is_two_pass_svc(cpi)) {
cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame = 1;
cpi->ref_frame_flags &=
(~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
@@ -1247,7 +1247,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
} else {
cm->frame_type = INTER_FRAME;
- if (is_spatial_svc(cpi)) {
+ if (is_two_pass_svc(cpi)) {
LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
if (cpi->svc.spatial_layer_id == 0) {
lc->is_key_frame = 0;
diff --git a/source/libvpx/vp9/encoder/vp9_rd.c b/source/libvpx/vp9/encoder/vp9_rd.c
index 4fc3e9e..b826ff4 100644
--- a/source/libvpx/vp9/encoder/vp9_rd.c
+++ b/source/libvpx/vp9/encoder/vp9_rd.c
@@ -364,20 +364,16 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
int ref_frame, BLOCK_SIZE block_size) {
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
- int_mv this_mv;
int i;
int zero_seen = 0;
int best_index = 0;
int best_sad = INT_MAX;
int this_sad = INT_MAX;
int max_mv = 0;
-
uint8_t *src_y_ptr = x->plane[0].src.buf;
uint8_t *ref_y_ptr;
- int row_offset, col_offset;
- int num_mv_refs = MAX_MV_REF_CANDIDATES +
+ const int num_mv_refs = MAX_MV_REF_CANDIDATES +
(cpi->sf.adaptive_motion_search &&
- cpi->common.show_frame &&
block_size < cpi->sf.max_partition_size);
MV pred_mv[3];
@@ -387,19 +383,16 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
// Get the sad for each candidate reference mv.
for (i = 0; i < num_mv_refs; ++i) {
- this_mv.as_mv = pred_mv[i];
+ const MV *this_mv = &pred_mv[i];
- max_mv = MAX(max_mv,
- MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
- // Only need to check zero mv once.
- if (!this_mv.as_int && zero_seen)
+ max_mv = MAX(max_mv, MAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
+ if (is_zero_mv(this_mv) && zero_seen)
continue;
- zero_seen = zero_seen || !this_mv.as_int;
+ zero_seen |= is_zero_mv(this_mv);
- row_offset = this_mv.as_mv.row >> 3;
- col_offset = this_mv.as_mv.col >> 3;
- ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset;
+ ref_y_ptr =
+ &ref_y_buffer[ref_y_stride * (this_mv->row >> 3) + (this_mv->col >> 3)];
// Find sad for current vector.
this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
@@ -462,7 +455,7 @@ void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
// Set baseline threshold values.
for (i = 0; i < MAX_MODES; ++i)
- rd->thresh_mult[i] = is_best_mode(cpi->oxcf.mode) ? -500 : 0;
+ rd->thresh_mult[i] = cpi->oxcf.mode == BEST ? -500 : 0;
rd->thresh_mult[THR_NEARESTMV] = 0;
rd->thresh_mult[THR_NEARESTG] = 0;
@@ -548,7 +541,7 @@ void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
int i;
for (i = 0; i < MAX_REFS; ++i)
- rd->thresh_mult_sub8x8[i] = is_best_mode(cpi->oxcf.mode) ? -500 : 0;
+ rd->thresh_mult_sub8x8[i] = cpi->oxcf.mode == BEST ? -500 : 0;
rd->thresh_mult_sub8x8[THR_LAST] += 2500;
rd->thresh_mult_sub8x8[THR_GOLD] += 2500;
diff --git a/source/libvpx/vp9/encoder/vp9_rdopt.c b/source/libvpx/vp9/encoder/vp9_rdopt.c
index cfda964..506c9bc 100644
--- a/source/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/source/libvpx/vp9/encoder/vp9_rdopt.c
@@ -171,30 +171,53 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
int64_t dist_sum = 0;
const int ref = xd->mi[0]->mbmi.ref_frame[0];
unsigned int sse;
+ unsigned int var = 0;
+ unsigned int sum_sse = 0;
const int shift = 8;
+ int rate;
+ int64_t dist;
+
+ x->pred_sse[ref] = 0;
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblock_plane *const p = &x->plane[i];
struct macroblockd_plane *const pd = &xd->plane[i];
const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+ const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
+ int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+ int bh = 1 << (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+ int idx, idy;
+ int lw = b_width_log2_lookup[unit_size] + 2;
+ int lh = b_height_log2_lookup[unit_size] + 2;
+
+ sum_sse = 0;
+
+ for (idy = 0; idy < bh; ++idy) {
+ for (idx = 0; idx < bw; ++idx) {
+ uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
+ uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lh);
+ int block_idx = (idy << 1) + idx;
+
+ var = cpi->fn_ptr[unit_size].vf(src, p->src.stride,
+ dst, pd->dst.stride, &sse);
+ x->bsse[(i << 2) + block_idx] = sse;
+ sum_sse += sse;
+
+ if (!x->select_tx_size) {
+ if (x->bsse[(i << 2) + block_idx] < p->quant_thred[0] >> shift)
+ x->skip_txfm[(i << 2) + block_idx] = 1;
+ else if (var < p->quant_thred[1] >> shift)
+ x->skip_txfm[(i << 2) + block_idx] = 2;
+ else
+ x->skip_txfm[(i << 2) + block_idx] = 0;
+ }
- const unsigned int var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
- pd->dst.buf, pd->dst.stride,
- &sse);
-
- if (!x->select_tx_size) {
- if (sse < p->quant_thred[0] >> shift)
- x->skip_txfm[i] = 1;
- else if (var < p->quant_thred[1] >> shift)
- x->skip_txfm[i] = 2;
- else
- x->skip_txfm[i] = 0;
+ if (i == 0)
+ x->pred_sse[ref] += sse;
+ }
}
- x->bsse[i] = sse;
- if (i == 0)
- x->pred_sse[ref] = sse;
-
// Fast approximate the modelling function.
if (cpi->oxcf.speed > 4) {
int64_t rate;
@@ -210,9 +233,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
rate_sum += rate;
dist_sum += dist;
} else {
- int rate;
- int64_t dist;
- vp9_model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
+ vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
pd->dequant[1] >> 3, &rate, &dist);
rate_sum += rate;
dist_sum += dist;
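
model_rd_for_sb() now walks each plane in max-transform-size units, accumulating per-unit SSE and classifying each unit for transform skipping against two quantizer-derived thresholds. A reduced sketch of the per-unit classification; the enum names and classify_unit are illustrative, as the encoder stores the raw values 0/1/2:

    #include <assert.h>
    #include <stdint.h>

    /* Illustrative names for the three stored values. */
    enum { TXFM_FULL = 0, TXFM_SKIP_ALL = 1, TXFM_DC_ONLY = 2 };

    /* Per-unit decision mirroring the new loop: skip the whole transform
     * when the unit's SSE is below the first quantizer-derived threshold,
     * keep only the DC coefficient when its variance is below the second. */
    static uint8_t classify_unit(unsigned int sse, unsigned int var,
                                 unsigned int quant_thred0,
                                 unsigned int quant_thred1) {
      const int shift = 8;  /* same scaling as the patch */
      if (sse < (quant_thred0 >> shift)) return TXFM_SKIP_ALL;
      if (var < (quant_thred1 >> shift)) return TXFM_DC_ONLY;
      return TXFM_FULL;
    }

    int main(void) {
      /* Thresholds of 5120 and 10240 scale down to 20 and 40. */
      assert(classify_unit(10, 8, 5120, 10240) == TXFM_SKIP_ALL);
      assert(classify_unit(30, 25, 5120, 10240) == TXFM_DC_ONLY);
      assert(classify_unit(99, 90, 5120, 10240) == TXFM_FULL);
      return 0;
    }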
@@ -372,17 +393,17 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
if (!is_inter_block(mbmi)) {
vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
dist_block(plane, block, tx_size, args);
- } else {
- if (x->skip_txfm[plane] == 0) {
+ } else if (max_txsize_lookup[plane_bsize] == tx_size) {
+ if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
// full forward transform and quantization
vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
dist_block(plane, block, tx_size, args);
- } else if (x->skip_txfm[plane] == 2) {
+ } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
// compute DC coefficient
int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
int16_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
- args->sse = x->bsse[plane] << 4;
+ args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
args->dist = args->sse;
if (!x->plane[plane].eobs[block])
args->dist = args->sse - ((coeff[0] * coeff[0] -
@@ -390,9 +411,13 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
} else {
// skip forward transform
x->plane[plane].eobs[block] = 0;
- args->sse = x->bsse[plane] << 4;
+ args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
args->dist = args->sse;
}
+ } else {
+ // full forward transform and quantization
+ vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+ dist_block(plane, block, tx_size, args);
}
rate_block(plane, block, plane_bsize, tx_size, args);
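
block_rd_txfm() above indexes the enlarged skip_txfm[]/bsse[] arrays with (plane << 2) + (block >> (tx_size << 1)): four slots per plane, with the shift converting a 4x4-unit block index into a transform-block index. A worked sketch; skip_txfm_index is a hypothetical helper name:

    #include <assert.h>

    /* tx_size convention: TX_4X4=0, TX_8X8=1, TX_16X16=2, TX_32X32=3. */
    static int skip_txfm_index(int plane, int block, int tx_size) {
      /* `block` counts 4x4 units; one tx block spans 1 << (2 * tx_size)
       * of them, hence the shift by (tx_size << 1). Four slots per plane. */
      return (plane << 2) + (block >> (tx_size << 1));
    }

    int main(void) {
      /* A 64x64 luma area coded with 32x32 transforms has four tx blocks
       * starting at 4x4-unit offsets 0, 64, 128, 192. */
      assert(skip_txfm_index(0, 0, 3) == 0);
      assert(skip_txfm_index(0, 64, 3) == 1);
      assert(skip_txfm_index(0, 192, 3) == 3);
      /* The first chroma plane begins at slot 4. */
      assert(skip_txfm_index(1, 0, 3) == 4);
      return 0;
    }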
@@ -468,7 +493,6 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x,
txfm_rd_in_plane(x, rate, distortion, skip,
sse, ref_best_rd, 0, bs,
mbmi->tx_size, cpi->sf.use_fast_coef_costing);
- cpi->tx_stepdown_count[0]++;
}
static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
@@ -551,60 +575,36 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
- cpi->tx_stepdown_count[0]++;
} else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
- cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
} else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
- cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
} else {
tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
- cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
}
}
-static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
- int64_t *distortion, int *skip,
- int64_t *psse, BLOCK_SIZE bs,
- int64_t txfm_cache[TX_MODES],
- int64_t ref_best_rd) {
+static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+ int64_t *distortion, int *skip,
+ int64_t *psse, BLOCK_SIZE bs,
+ int64_t txfm_cache[TX_MODES],
+ int64_t ref_best_rd) {
MACROBLOCKD *xd = &x->e_mbd;
+ int64_t sse;
+ int64_t *ret_sse = psse ? psse : &sse;
assert(bs == xd->mi[0]->mbmi.sb_type);
- vp9_subtract_plane(x, bs, 0);
-
if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
- choose_largest_tx_size(cpi, x, rate, distortion, skip, psse, ref_best_rd,
+ choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
bs);
} else {
- choose_tx_size_from_rd(cpi, x, rate, distortion, skip, psse,
+ choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse,
txfm_cache, ref_best_rd, bs);
}
}
-static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
- int64_t *distortion, int *skip,
- BLOCK_SIZE bs,
- int64_t txfm_cache[TX_MODES],
- int64_t ref_best_rd) {
- MACROBLOCKD *xd = &x->e_mbd;
- int64_t sse;
-
- assert(bs == xd->mi[0]->mbmi.sb_type);
- if (cpi->sf.tx_size_search_method != USE_FULL_RD || xd->lossless) {
- vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
- choose_largest_tx_size(cpi, x, rate, distortion, skip, &sse, ref_best_rd,
- bs);
- } else {
- choose_tx_size_from_rd(cpi, x, rate, distortion, skip, &sse,
- txfm_cache, ref_best_rd, bs);
- }
-}
-
-
static int conditional_skipintra(PREDICTION_MODE mode,
PREDICTION_MODE best_intra_mode) {
if (mode == D117_PRED &&
@@ -854,8 +854,8 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
mic->mbmi.mode = mode;
- intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
- &s, bsize, local_tx_cache, best_rd);
+ super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+ &s, NULL, bsize, local_tx_cache, best_rd);
if (this_rate_tokenonly == INT_MAX)
continue;
@@ -1365,13 +1365,14 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
int sadpb = x->sadperbit4;
MV mvp_full;
int max_mv;
+ int sad_list[5];
/* Is the best so far sufficiently good that we can't justify doing
* a new motion search. */
if (best_rd < label_mv_thresh)
break;
- if (!is_best_mode(cpi->oxcf.mode)) {
+ if (cpi->oxcf.mode != BEST) {
// use previous block's result as next block's MV predictor.
if (i > 0) {
bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
@@ -1397,7 +1398,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
mvp_full.row = bsi->mvp.as_mv.row >> 3;
mvp_full.col = bsi->mvp.as_mv.col >> 3;
- if (cpi->sf.adaptive_motion_search && cm->show_frame) {
+ if (cpi->sf.adaptive_motion_search) {
mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
step_param = MAX(step_param, 8);
@@ -1408,12 +1409,14 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
- bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
- sadpb, &bsi->ref_mv[0]->as_mv, new_mv,
- INT_MAX, 1);
+ bestsme = vp9_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, sadpb,
+ cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? sad_list : NULL,
+ &bsi->ref_mv[0]->as_mv, new_mv,
+ INT_MAX, 1);
// Should we do a full search (best quality only)
- if (is_best_mode(cpi->oxcf.mode)) {
+ if (cpi->oxcf.mode == BEST) {
int_mv *const best_mv = &mi->bmi[i].as_mv[0];
/* Check if mvp_full is within the range. */
clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
@@ -1422,6 +1425,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
sadpb, 16, &cpi->fn_ptr[bsize],
&bsi->ref_mv[0]->as_mv,
&best_mv->as_mv);
+ sad_list[1] = sad_list[2] = sad_list[3] = sad_list[4] = INT_MAX;
if (thissme < bestsme) {
bestsme = thissme;
*new_mv = best_mv->as_mv;
@@ -1434,17 +1438,19 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (bestsme < INT_MAX) {
int distortion;
- cpi->find_fractional_mv_step(x,
- new_mv,
- &bsi->ref_mv[0]->as_mv,
- cm->allow_high_precision_mv,
- x->errorperbit, &cpi->fn_ptr[bsize],
- cpi->sf.mv.subpel_force_stop,
- cpi->sf.mv.subpel_iters_per_step,
- x->nmvjointcost, x->mvcost,
- &distortion,
- &x->pred_sse[mbmi->ref_frame[0]],
- NULL, 0, 0);
+ cpi->find_fractional_mv_step(
+ x,
+ new_mv,
+ &bsi->ref_mv[0]->as_mv,
+ cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_sad_list(cpi, sad_list),
+ x->nmvjointcost, x->mvcost,
+ &distortion,
+ &x->pred_sse[mbmi->ref_frame[0]],
+ NULL, 0, 0);
// save motion search result for use in compound prediction
seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
@@ -1701,12 +1707,14 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int mode_index,
int64_t comp_pred_diff[REFERENCE_MODES],
const int64_t tx_size_diff[TX_MODES],
- int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS],
+ int skippable) {
MACROBLOCKD *const xd = &x->e_mbd;
// Take a snapshot of the coding context so it can be
// restored if we decide to encode this way
ctx->skip = x->skip;
+ ctx->skippable = skippable;
ctx->best_mode_index = mode_index;
ctx->mic = *xd->mi[0];
ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
@@ -1772,6 +1780,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int tmp_col_max = x->mv_col_max;
int tmp_row_min = x->mv_row_min;
int tmp_row_max = x->mv_row_max;
+ int sad_list[5];
const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
ref);
@@ -1806,8 +1815,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
step_param = cpi->mv_step_param;
}
- if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
- cm->show_frame) {
+ if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize),
b_width_log2(bsize)));
step_param = MAX(step_param, boffset);
@@ -1844,6 +1852,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
mvp_full.row >>= 3;
bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+ cond_sad_list(cpi, sad_list),
&ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
x->mv_col_min = tmp_col_min;
@@ -1859,13 +1868,14 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
&cpi->fn_ptr[bsize],
cpi->sf.mv.subpel_force_stop,
cpi->sf.mv.subpel_iters_per_step,
+ cond_sad_list(cpi, sad_list),
x->nmvjointcost, x->mvcost,
&dis, &x->pred_sse[ref], NULL, 0, 0);
}
*rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
- if (cpi->sf.adaptive_motion_search && cm->show_frame)
+ if (cpi->sf.adaptive_motion_search)
x->pred_mv[ref] = tmp_mv->as_mv;
if (scaled_ref_frame) {
@@ -1983,6 +1993,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
x->errorperbit,
&cpi->fn_ptr[bsize],
0, cpi->sf.mv.subpel_iters_per_step,
+ NULL,
x->nmvjointcost, x->mvcost,
&dis, &sse, second_pred,
pw, ph);
@@ -2118,6 +2129,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int_mv (*mode_mv)[MAX_REF_FRAMES],
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
+ INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
+ int (*single_skippable)[MAX_REF_FRAMES],
int64_t *psse,
const int64_t ref_best_rd) {
VP9_COMMON *cm = &cpi->common;
@@ -2135,14 +2148,14 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
int pred_exists = 0;
int intpel_mv;
- int64_t rd, best_rd = INT64_MAX;
+ int64_t rd, tmp_rd, best_rd = INT64_MAX;
int best_needs_copy = 0;
uint8_t *orig_dst[MAX_MB_PLANE];
int orig_dst_stride[MAX_MB_PLANE];
int rs = 0;
INTERP_FILTER best_filter = SWITCHABLE;
- int skip_txfm[MAX_MB_PLANE] = {0};
- int64_t bsse[MAX_MB_PLANE] = {0};
+ uint8_t skip_txfm[MAX_MB_PLANE << 2] = {0};
+ int64_t bsse[MAX_MB_PLANE << 2] = {0};
int bsl = mi_width_log2_lookup[bsize];
int pred_filter_search = cpi->sf.cb_pred_filter_search ?
@@ -2164,6 +2177,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (frame_mv[refs[0]].as_int == INVALID_MV ||
frame_mv[refs[1]].as_int == INVALID_MV)
return INT64_MAX;
+
+ if (cpi->sf.adaptive_mode_search) {
+ if (single_filter[this_mode][refs[0]] ==
+ single_filter[this_mode][refs[1]])
+ best_filter = single_filter[this_mode][refs[0]];
+ }
}
if (this_mode == NEWMV) {
@@ -2225,6 +2244,10 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
* if the first is known */
*rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
+ if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
+ mbmi->mode != NEARESTMV)
+ return INT64_MAX;
+
pred_exists = 0;
// Are all MVs integer pel for Y and UV
intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
@@ -2263,6 +2286,13 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
} else {
int rate_sum = 0;
int64_t dist_sum = 0;
+ if (i > 0 && cpi->sf.adaptive_interp_filter_search &&
+ (cpi->sf.interp_filter_search_mask & (1 << i))) {
+ rate_sum = INT_MAX;
+ dist_sum = INT64_MAX;
+ continue;
+ }
+
if ((cm->interp_filter == SWITCHABLE &&
(!i || best_needs_copy)) ||
(cm->interp_filter != SWITCHABLE &&
@@ -2313,6 +2343,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
(cm->interp_filter != SWITCHABLE &&
cm->interp_filter == mbmi->interp_filter)) {
pred_exists = 1;
+ tmp_rd = best_rd;
}
}
restore_dst_buf(xd, orig_dst, orig_dst_stride);
@@ -2331,17 +2362,30 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
xd->plane[i].dst.stride = 64;
}
}
+ rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0);
} else {
+ int tmp_rate;
+ int64_t tmp_dist;
// Handles the special case when a filter that is not in the
- // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
+ // switchable list (ex. bilinear) is indicated at the frame level, or
+ // the skip condition holds.
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
+ rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
+ vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+ vpx_memcpy(bsse, x->bsse, sizeof(bsse));
}
+ if (!is_comp_pred)
+ single_filter[this_mode][refs[0]] = mbmi->interp_filter;
+
+ if (cpi->sf.adaptive_mode_search)
+ if (is_comp_pred)
+ if (single_skippable[this_mode][refs[0]] &&
+ single_skippable[this_mode][refs[1]])
+ vpx_memset(skip_txfm, 1, sizeof(skip_txfm));
+
if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
- int tmp_rate;
- int64_t tmp_dist;
- model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
- rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
// if current pred_error modeled rd is substantially more than the best
// so far, do not bother doing full rd
if (rd / 2 > ref_best_rd) {
@@ -2351,7 +2395,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
if (cm->interp_filter == SWITCHABLE)
- *rate2 += vp9_get_switchable_rate(cpi);
+ *rate2 += rs;
if (!is_comp_pred) {
if (cpi->allow_encode_breakout)
@@ -2368,8 +2412,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int64_t rdcosty = INT64_MAX;
// Y cost and distortion
- inter_super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
- bsize, txfm_cache, ref_best_rd);
+ vp9_subtract_plane(x, bsize, 0);
+ super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
+ bsize, txfm_cache, ref_best_rd);
if (*rate_y == INT_MAX) {
*rate2 = INT_MAX;
@@ -2399,6 +2444,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
*skippable = skippable_y && skippable_uv;
}
+ if (!is_comp_pred)
+ single_skippable[this_mode][refs[0]] = *skippable;
+
restore_dst_buf(xd, orig_dst, orig_dst_stride);
return this_rd; // if 0, this will be re-calculated by caller
}
@@ -2505,10 +2553,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
PREDICTION_MODE this_mode;
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
unsigned char segment_id = mbmi->segment_id;
- int comp_pred, i;
+ int comp_pred, i, k;
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
struct buf_2d yv12_mb[4][MAX_MB_PLANE];
int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
+ INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
+ int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
VP9_ALT_FLAG };
int64_t best_rd = best_rd_so_far;
@@ -2519,6 +2569,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
MB_MODE_INFO best_mbmode;
+ int best_mode_skippable = 0;
int mode_index, best_mode_index = -1;
unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
vp9_prob comp_mode_p;
@@ -2556,6 +2607,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
rate_uv_intra[i] = INT_MAX;
for (i = 0; i < MAX_REF_FRAMES; ++i)
x->pred_sse[i] = INT_MAX;
+ for (i = 0; i < MB_MODE_COUNT; ++i) {
+ for (k = 0; k < MAX_REF_FRAMES; ++k) {
+ single_inter_filter[i][k] = SWITCHABLE;
+ single_skippable[i][k] = 0;
+ }
+ }
*returnrate = INT_MAX;
@@ -2732,6 +2789,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
+ if (!cm->allow_comp_inter_inter)
+ continue;
+
if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
best_mode_index >=0 &&
vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
@@ -2747,6 +2807,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
if (ref_frame == INTRA_FRAME) {
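+ // Speed feature: skip intra modes when the source variance, scaled by
+ // the block's pixel count, already exceeds the best intra prediction
+ // SSE seen so far.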
+ if (cpi->sf.adaptive_mode_search)
+ if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_intra_rd)
+ continue;
+
if (!(intra_y_mode_mask & (1 << this_mode)))
continue;
if (this_mode != DC_PRED) {
@@ -2785,6 +2849,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// them for this frame.
mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
: cm->interp_filter;
+ mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+
x->skip = 0;
set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
@@ -2800,8 +2866,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx;
- intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
- bsize, tx_cache, best_rd);
+ super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+ NULL, bsize, tx_cache, best_rd);
if (rate_y == INT_MAX)
continue;
@@ -2831,7 +2897,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
&rate_uv, &distortion_uv,
&disable_skip, frame_mv,
mi_row, mi_col,
- single_newmv, &total_sse, best_rd);
+ single_newmv, single_inter_filter,
+ single_skippable, &total_sse, best_rd);
if (this_rd == INT64_MAX)
continue;
@@ -2919,6 +2986,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
/* required for left and above block mv */
mbmi->mv[0].as_int = 0;
max_plane = 1;
+ } else {
+ best_intra_rd = x->pred_sse[ref_frame];
}
*returnrate = rate2;
@@ -2926,6 +2995,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
best_rd = this_rd;
best_mbmode = *mbmi;
best_skip2 = this_skip2;
+ best_mode_skippable = skippable;
+
if (!x->select_tx_size)
swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
@@ -3025,6 +3096,28 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
break;
}
+ // The inter modes' rate costs are not calculated precisely in some cases,
+ // so NEWMV is sometimes chosen instead of NEARESTMV, NEARMV or ZEROMV. The
+ // checks below detect those cases and correct the mode decision.
+ if (best_mbmode.mode == NEWMV) {
+ const MV_REFERENCE_FRAME refs[2] = {best_mbmode.ref_frame[0],
+ best_mbmode.ref_frame[1]};
+ int comp_pred_mode = refs[1] > INTRA_FRAME;
+
+ if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+ ((comp_pred_mode && frame_mv[NEARESTMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int) || !comp_pred_mode))
+ best_mbmode.mode = NEARESTMV;
+ else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+ ((comp_pred_mode && frame_mv[NEARMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int) || !comp_pred_mode))
+ best_mbmode.mode = NEARMV;
+ else if (best_mbmode.mv[0].as_int == 0 &&
+ ((comp_pred_mode && best_mbmode.mv[1].as_int == 0) || !comp_pred_mode))
+ best_mbmode.mode = ZEROMV;
+ }
+
if (best_mode_index < 0 || best_rd >= best_rd_so_far)
return INT64_MAX;
@@ -3082,8 +3175,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
- store_coding_context(x, ctx, best_mode_index,
- best_pred_diff, best_tx_diff, best_filter_diff);
+ store_coding_context(x, ctx, best_mode_index, best_pred_diff,
+ best_tx_diff, best_filter_diff, best_mode_skippable);
return best_rd;
}
@@ -3188,7 +3281,7 @@ int64_t vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
if (!x->select_tx_size)
swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
store_coding_context(x, ctx, THR_ZEROMV,
- best_pred_diff, best_tx_diff, best_filter_diff);
+ best_pred_diff, best_tx_diff, best_filter_diff, 0);
return this_rd;
}
@@ -3325,6 +3418,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
+ if (!cm->allow_comp_inter_inter)
+ continue;
+
if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
continue;
// Do not allow compound prediction if the segment level reference frame
@@ -3793,7 +3889,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
store_coding_context(x, ctx, best_ref_index,
- best_pred_diff, best_tx_diff, best_filter_diff);
+ best_pred_diff, best_tx_diff, best_filter_diff, 0);
return best_rd;
}
diff --git a/source/libvpx/vp9/encoder/vp9_speed_features.c b/source/libvpx/vp9/encoder/vp9_speed_features.c
index 57835ec..dbf4ae9 100644
--- a/source/libvpx/vp9/encoder/vp9_speed_features.c
+++ b/source/libvpx/vp9/encoder/vp9_speed_features.c
@@ -65,7 +65,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
const int boosted = frame_is_boosted(cpi);
sf->adaptive_rd_thresh = 1;
- sf->recode_loop = (speed < 1) ? ALLOW_RECODE : ALLOW_RECODE_KFMAXBW;
sf->allow_skip_recode = 1;
if (speed >= 1) {
@@ -92,6 +91,12 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
sf->tx_size_search_breakout = 1;
+
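+ // Use a larger partition-search breakout distortion threshold for HD
+ // and larger frame sizes.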
+ if (MIN(cm->width, cm->height) >= 720)
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ else
+ sf->partition_search_breakout_dist_thr = (1 << 21);
+ sf->partition_search_breakout_rate_thr = 500;
}
if (speed >= 2) {
@@ -120,6 +125,12 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->auto_min_max_partition_size = CONSTRAIN_NEIGHBORING_MIN_MAX;
sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
sf->adjust_partitioning_from_last_frame = 1;
+
+ if (MIN(cm->width, cm->height) >= 720)
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ else
+ sf->partition_search_breakout_dist_thr = (1 << 22);
+ sf->partition_search_breakout_rate_thr = 700;
}
if (speed >= 3) {
@@ -132,17 +143,25 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
}
sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 1;
sf->cb_partition_search = !boosted;
sf->cb_pred_filter_search = 1;
sf->alt_ref_search_fp = 1;
sf->motion_field_mode_search = !boosted;
sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
- sf->last_partitioning_redo_frequency = 3;
+ sf->last_partitioning_redo_frequency = 2;
sf->recode_loop = ALLOW_RECODE_KFMAXBW;
sf->adaptive_rd_thresh = 3;
sf->mode_skip_start = 6;
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
+ sf->adaptive_interp_filter_search = 1;
+
+ if (MIN(cm->width, cm->height) >= 720)
+ sf->partition_search_breakout_dist_thr = (1 << 25);
+ else
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ sf->partition_search_breakout_rate_thr = 1000;
}
if (speed >= 4) {
@@ -157,6 +176,12 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->use_lp32x32fdct = 1;
sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
sf->use_fast_coef_costing = 1;
+
+ if (MIN(cm->width, cm->height) >= 720)
+ sf->partition_search_breakout_dist_thr = (1 << 26);
+ else
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ sf->partition_search_breakout_rate_thr = 1500;
}
if (speed >= 5) {
@@ -180,8 +205,8 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
int speed, vp9e_tune_content content) {
VP9_COMMON *const cm = &cpi->common;
- const int frames_since_key =
- cm->frame_type == KEY_FRAME ? 0 : cpi->rc.frames_since_key;
+ const int is_keyframe = cm->frame_type == KEY_FRAME;
+ const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
sf->static_segmentation = 0;
sf->adaptive_rd_thresh = 1;
sf->use_fast_coef_costing = 1;
@@ -277,17 +302,16 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
}
if (speed >= 5) {
- sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
- sf->auto_min_max_partition_size = (cm->frame_type == KEY_FRAME) ?
- RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
+ sf->use_quant_fp = !is_keyframe;
+ sf->auto_min_max_partition_size = is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX
+ : STRICT_NEIGHBORING_MIN_MAX;
sf->max_partition_size = BLOCK_32X32;
sf->min_partition_size = BLOCK_8X8;
sf->partition_check =
(frames_since_key % sf->last_partitioning_redo_frequency == 1);
- sf->force_frame_boost = cm->frame_type == KEY_FRAME ||
- (frames_since_key %
- (sf->last_partitioning_redo_frequency << 1) == 1);
- sf->max_delta_qindex = (cm->frame_type == KEY_FRAME) ? 20 : 15;
+ sf->force_frame_boost = is_keyframe ||
+ (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
+ sf->max_delta_qindex = is_keyframe ? 20 : 15;
sf->partition_search_type = REFERENCE_PARTITION;
sf->use_nonrd_pick_mode = 1;
sf->allow_skip_recode = 0;
@@ -305,8 +329,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
sf->partition_search_type = SOURCE_VAR_BASED_PARTITION;
sf->search_type_check_frequency = 50;
- sf->tx_size_search_method = (cm->frame_type == KEY_FRAME) ?
- USE_LARGESTALL : USE_TX_8X8;
+ sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
// This feature is only enabled when partition search is disabled.
sf->reuse_inter_pred_sby = 1;
@@ -316,6 +339,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
sf->mv.reduce_first_step_size = 1;
}
+
if (speed >= 7) {
sf->mv.search_method = FAST_DIAMOND;
sf->mv.fullpel_search_step_param = 10;
@@ -324,10 +348,12 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
800 : 300;
sf->elevate_newmv_thresh = 2500;
}
+
if (speed >= 12) {
sf->elevate_newmv_thresh = 4000;
sf->mv.subpel_force_stop = 2;
}
+
if (speed >= 13) {
int i;
sf->max_intra_bsize = BLOCK_32X32;
@@ -360,6 +386,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->use_lp32x32fdct = 0;
sf->adaptive_motion_search = 0;
sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 0;
sf->cb_pred_filter_search = 0;
sf->cb_partition_search = 0;
sf->motion_field_mode_search = 0;
@@ -380,6 +407,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->force_frame_boost = 0;
sf->max_delta_qindex = 0;
sf->disable_filter_search_var_thresh = 0;
+ sf->adaptive_interp_filter_search = 0;
+
for (i = 0; i < TX_SIZES; i++) {
sf->intra_y_mode_mask[i] = INTRA_ALL;
sf->intra_uv_mode_mask[i] = INTRA_ALL;
@@ -407,17 +436,17 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->recode_tolerance = 25;
sf->default_interp_filter = SWITCHABLE;
sf->tx_size_search_breakout = 0;
+ sf->partition_search_breakout_dist_thr = 0;
+ sf->partition_search_breakout_rate_thr = 0;
- if (oxcf->mode == REALTIME) {
+ if (oxcf->mode == REALTIME)
set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
- } else {
- if (!is_best_mode(oxcf->mode))
- set_good_speed_feature(cpi, cm, sf, oxcf->speed);
- }
+ else if (oxcf->mode == GOOD)
+ set_good_speed_feature(cpi, cm, sf, oxcf->speed);
cpi->full_search_sad = vp9_full_search_sad;
- cpi->diamond_search_sad = is_best_mode(oxcf->mode) ? vp9_full_range_search
- : vp9_diamond_search_sad;
+ cpi->diamond_search_sad = oxcf->mode == BEST ? vp9_full_range_search
+ : vp9_diamond_search_sad;
cpi->refining_search_sad = vp9_refining_search_sad;
@@ -434,6 +463,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
if (sf->mv.subpel_search_method == SUBPEL_TREE) {
cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
+ cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned;
}
cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
diff --git a/source/libvpx/vp9/encoder/vp9_speed_features.h b/source/libvpx/vp9/encoder/vp9_speed_features.h
index bad956d..33c441f 100644
--- a/source/libvpx/vp9/encoder/vp9_speed_features.h
+++ b/source/libvpx/vp9/encoder/vp9_speed_features.h
@@ -40,6 +40,7 @@ typedef enum {
typedef enum {
SUBPEL_TREE = 0,
+ SUBPEL_TREE_PRUNED = 1,
// Other methods to come
} SUBPEL_SEARCH_METHODS;
@@ -103,6 +104,12 @@ typedef enum {
} MODE_SEARCH_SKIP_LOGIC;
typedef enum {
+ FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP,
+ FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
+ FLAG_SKIP_EIGHTTAP_SHARP = 1 << EIGHTTAP_SHARP,
+} INTERP_FILTER_MASK;
+
+typedef enum {
// Search partitions using RD/NONRD criterion
SEARCH_PARTITION = 0,
@@ -284,6 +291,9 @@ typedef struct SPEED_FEATURES {
// was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
int adaptive_pred_interp_filter;
+ // Adaptive prediction mode search
+ int adaptive_mode_search;
+
// Chessboard pattern prediction filter type search
int cb_pred_filter_search;
@@ -380,6 +390,16 @@ typedef struct SPEED_FEATURES {
// Early termination in transform size search, which only applies while
// tx_size_search_method is USE_FULL_RD.
int tx_size_search_breakout;
+
+ // Adaptive interp_filter search to allow skipping of certain filter types.
+ int adaptive_interp_filter_search;
+
+ // Mask for skipping evaluation of certain interp_filter types.
+ INTERP_FILTER_MASK interp_filter_search_mask;
+
+ // Partition search early breakout thresholds.
+ int64_t partition_search_breakout_dist_thr;
+ int partition_search_breakout_rate_thr;
} SPEED_FEATURES;
struct VP9_COMP;
diff --git a/source/libvpx/vp9/encoder/vp9_ssim.c b/source/libvpx/vp9/encoder/vp9_ssim.c
index 026e6a8..8435640 100644
--- a/source/libvpx/vp9/encoder/vp9_ssim.c
+++ b/source/libvpx/vp9/encoder/vp9_ssim.c
@@ -95,7 +95,7 @@ double vp9_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1,
return ssim_total;
}
double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
- int lumamask, double *weight) {
+ double *weight) {
double a, b, c;
double ssimv;
diff --git a/source/libvpx/vp9/encoder/vp9_ssim.h b/source/libvpx/vp9/encoder/vp9_ssim.h
index a581c2c..d1dd1b7 100644
--- a/source/libvpx/vp9/encoder/vp9_ssim.h
+++ b/source/libvpx/vp9/encoder/vp9_ssim.h
@@ -18,7 +18,7 @@ extern "C" {
#include "vpx_scale/yv12config.h"
double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
- int lumamask, double *weight);
+ double *weight);
double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
double *ssim_y, double *ssim_u, double *ssim_v);
diff --git a/source/libvpx/vp9/encoder/vp9_svc_layercontext.c b/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
index fb52d1a..7545d87 100644
--- a/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
+++ b/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -19,12 +19,12 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
int layer;
int layer_end;
- int alt_ref_idx = svc->number_spatial_layers;
+ int alt_ref_idx = svc->number_spatial_layers * svc->number_temporal_layers;
svc->spatial_layer_id = 0;
svc->temporal_layer_id = 0;
- if (svc->number_temporal_layers > 1) {
+ if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
layer_end = svc->number_temporal_layers;
} else {
layer_end = svc->number_spatial_layers;
@@ -36,6 +36,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
int i;
lc->current_video_frame_in_layer = 0;
lc->layer_size = 0;
+ lc->frames_from_key_frame = 0;
+ lc->last_frame_type = FRAME_TYPES;
lrc->ni_av_qi = oxcf->worst_allowed_q;
lrc->total_actual_bits = 0;
lrc->total_target_vs_actual = 0;
@@ -50,7 +52,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
lrc->rate_correction_factors[i] = 1.0;
}
- if (svc->number_temporal_layers > 1) {
+ if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
lc->target_bandwidth = oxcf->ts_target_bitrate[layer];
lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
@@ -75,7 +77,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
}
// Still have extra buffer for base layer golden frame
- if (svc->number_spatial_layers > 1 && alt_ref_idx < REF_FRAMES)
+ if (!(svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR)
+ && alt_ref_idx < REF_FRAMES)
svc->layer_context[0].gold_ref_idx = alt_ref_idx;
}
@@ -89,7 +92,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
int layer_end;
float bitrate_alloc = 1.0;
- if (svc->number_temporal_layers > 1) {
+ if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
layer_end = svc->number_temporal_layers;
} else {
layer_end = svc->number_spatial_layers;
@@ -99,7 +102,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
LAYER_CONTEXT *const lc = &svc->layer_context[layer];
RATE_CONTROL *const lrc = &lc->rc;
- if (svc->number_temporal_layers > 1) {
+ if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
lc->target_bandwidth = oxcf->ts_target_bitrate[layer];
} else {
lc->target_bandwidth = oxcf->ss_target_bitrate[layer];
@@ -115,7 +118,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size);
// Update framerate-related quantities.
- if (svc->number_temporal_layers > 1) {
+ if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer];
} else {
lc->framerate = cpi->framerate;
@@ -128,16 +131,16 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
}
}
-static LAYER_CONTEXT *get_layer_context(SVC *svc) {
- return svc->number_temporal_layers > 1 ?
- &svc->layer_context[svc->temporal_layer_id] :
- &svc->layer_context[svc->spatial_layer_id];
+static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) {
+ return (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ?
+ &cpi->svc.layer_context[cpi->svc.temporal_layer_id] :
+ &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
}
void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) {
SVC *const svc = &cpi->svc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
- LAYER_CONTEXT *const lc = get_layer_context(svc);
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
RATE_CONTROL *const lrc = &lc->rc;
const int layer = svc->temporal_layer_id;
@@ -159,7 +162,7 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) {
void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
- LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
RATE_CONTROL *const lrc = &lc->rc;
lc->framerate = framerate;
@@ -172,7 +175,7 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
}
void vp9_restore_layer_context(VP9_COMP *const cpi) {
- LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
const int old_frame_since_key = cpi->rc.frames_since_key;
const int old_frame_to_key = cpi->rc.frames_to_key;
@@ -190,7 +193,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) {
void vp9_save_layer_context(VP9_COMP *const cpi) {
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
- LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
lc->rc = cpi->rc;
lc->twopass = cpi->twopass;
@@ -214,15 +217,17 @@ void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) {
svc->spatial_layer_id = 0;
}
-void vp9_inc_frame_in_layer(SVC *svc) {
- LAYER_CONTEXT *const lc = (svc->number_temporal_layers > 1)
- ? &svc->layer_context[svc->temporal_layer_id]
- : &svc->layer_context[svc->spatial_layer_id];
+void vp9_inc_frame_in_layer(VP9_COMP *const cpi) {
+ LAYER_CONTEXT *const lc =
+ (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ?
+ &cpi->svc.layer_context[cpi->svc.temporal_layer_id] :
+ &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
++lc->current_video_frame_in_layer;
+ ++lc->frames_from_key_frame;
}
int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) {
- return is_spatial_svc(cpi) &&
+ return is_two_pass_svc(cpi) &&
cpi->svc.spatial_layer_id > 0 &&
cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame;
}
@@ -257,6 +262,7 @@ static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) {
int layer_id;
vpx_svc_parameters_t *layer_param;
LAYER_CONTEXT *lc;
+ int count = 1 << (cpi->svc.number_temporal_layers - 1);
// Find the next layer to be encoded
for (layer_id = 0; layer_id < cpi->svc.number_spatial_layers; ++layer_id) {
@@ -274,17 +280,36 @@ static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) {
lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
- cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
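+ // Derive the temporal layer id of the current frame: with T temporal
+ // layers, layer 0 frames occur every 2^(T-1) frames in the spatial
+ // layer, layer 1 every 2^(T-2), and so on.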
+ cpi->svc.temporal_layer_id = 0;
+ while ((lc->current_video_frame_in_layer % count) != 0) {
+ ++cpi->svc.temporal_layer_id;
+ count >>= 1;
+ }
+
+ cpi->lst_fb_idx =
+ cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers +
+ cpi->svc.temporal_layer_id;
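+ // Do not reference the last frame until every temporal layer has been
+ // coded once after a key frame.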
+ if (lc->frames_from_key_frame < cpi->svc.number_temporal_layers)
+ cpi->ref_frame_flags &= ~VP9_LAST_FLAG;
- if (cpi->svc.spatial_layer_id < 1)
+ if (cpi->svc.spatial_layer_id == 0) {
+ if (cpi->svc.temporal_layer_id == 0)
cpi->gld_fb_idx = lc->gold_ref_idx >= 0 ?
lc->gold_ref_idx : cpi->lst_fb_idx;
- else
- cpi->gld_fb_idx = cpi->svc.spatial_layer_id - 1;
+ else
+ cpi->gld_fb_idx = cpi->lst_fb_idx - 1;
+ } else {
+ if (cpi->svc.temporal_layer_id == 0)
+ cpi->gld_fb_idx = cpi->svc.spatial_layer_id -
+ cpi->svc.number_temporal_layers;
+ else
+ cpi->gld_fb_idx = cpi->lst_fb_idx - 1;
+ }
if (lc->current_video_frame_in_layer == 0) {
if (cpi->svc.spatial_layer_id >= 2) {
- cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2;
+ cpi->alt_fb_idx =
+ cpi->svc.spatial_layer_id - 2 * cpi->svc.number_temporal_layers;
} else {
cpi->alt_fb_idx = cpi->lst_fb_idx;
cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG);
@@ -306,7 +331,8 @@ static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) {
lc_lower->alt_ref_source != NULL)
cpi->alt_fb_idx = lc_lower->alt_ref_idx;
else if (cpi->svc.spatial_layer_id >= 2)
- cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2;
+ cpi->alt_fb_idx =
+ cpi->svc.spatial_layer_id - 2 * cpi->svc.number_temporal_layers;
else
cpi->alt_fb_idx = cpi->lst_fb_idx;
}
@@ -325,7 +351,7 @@ static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) {
vp9_set_high_precision_mv(cpi, 1);
- cpi->alt_ref_source = get_layer_context(&cpi->svc)->alt_ref_source;
+ cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source;
return 0;
}
diff --git a/source/libvpx/vp9/encoder/vp9_svc_layercontext.h b/source/libvpx/vp9/encoder/vp9_svc_layercontext.h
index 801449b..1fc43a4 100644
--- a/source/libvpx/vp9/encoder/vp9_svc_layercontext.h
+++ b/source/libvpx/vp9/encoder/vp9_svc_layercontext.h
@@ -25,9 +25,11 @@ typedef struct {
double framerate;
int avg_frame_size;
TWO_PASS twopass;
- struct vpx_fixed_buf rc_twopass_stats_in;
+ vpx_fixed_buf_t rc_twopass_stats_in;
unsigned int current_video_frame_in_layer;
int is_key_frame;
+ int frames_from_key_frame;
+ FRAME_TYPE last_frame_type;
vpx_svc_parameters_t svc_params_received;
struct lookahead_entry *alt_ref_source;
int alt_ref_idx;
@@ -80,7 +82,7 @@ void vp9_save_layer_context(struct VP9_COMP *const cpi);
void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi);
// Increment number of video frames in layer
-void vp9_inc_frame_in_layer(SVC *svc);
+void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi);
// Check if current layer is key frame in spatial upper layer
int vp9_is_upper_layer_key_frame(const struct VP9_COMP *const cpi);
diff --git a/source/libvpx/vp9/encoder/vp9_temporal_filter.c b/source/libvpx/vp9/encoder/vp9_temporal_filter.c
index 076d776..18a6a91 100644
--- a/source/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/source/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -145,6 +145,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
int bestsme = INT_MAX;
int distortion;
unsigned int sse;
+ int sad_list[5];
MV best_ref_mv1 = {0, 0};
MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
@@ -168,6 +169,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
// Ignore mv costing by sending NULL pointer instead of cost arrays
vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
+ cond_sad_list(cpi, sad_list),
&cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv);
// Ignore mv costing by sending NULL pointer instead of cost array
@@ -177,6 +179,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
x->errorperbit,
&cpi->fn_ptr[BLOCK_16X16],
0, mv_sf->subpel_iters_per_step,
+ cond_sad_list(cpi, sad_list),
NULL, NULL,
&distortion, &sse, NULL, 0, 0);
@@ -188,6 +191,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
}
static void temporal_filter_iterate_c(VP9_COMP *cpi,
+ YV12_BUFFER_CONFIG **frames,
int frame_count,
int alt_ref_index,
int strength,
@@ -203,7 +207,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 * 3);
DECLARE_ALIGNED_ARRAY(16, uint16_t, count, 16 * 16 * 3);
MACROBLOCKD *mbd = &cpi->mb.e_mbd;
- YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
+ YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
uint8_t *dst1, *dst2;
DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 * 3);
const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
@@ -247,7 +251,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
const int thresh_low = 10000;
const int thresh_high = 20000;
- if (cpi->frames[frame] == NULL)
+ if (frames[frame] == NULL)
continue;
mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0;
@@ -258,9 +262,9 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
} else {
// Find best match in this frame by MC
int err = temporal_filter_find_matching_mb_c(cpi,
- cpi->frames[alt_ref_index]->y_buffer + mb_y_offset,
- cpi->frames[frame]->y_buffer + mb_y_offset,
- cpi->frames[frame]->y_stride);
+ frames[alt_ref_index]->y_buffer + mb_y_offset,
+ frames[frame]->y_buffer + mb_y_offset,
+ frames[frame]->y_stride);
// Assign higher weight to matching MB if its error
// score is lower. If not applying MC default behavior
@@ -272,10 +276,10 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
if (filter_weight != 0) {
// Construct the predictors
temporal_filter_predictors_mb_c(mbd,
- cpi->frames[frame]->y_buffer + mb_y_offset,
- cpi->frames[frame]->u_buffer + mb_uv_offset,
- cpi->frames[frame]->v_buffer + mb_uv_offset,
- cpi->frames[frame]->y_stride,
+ frames[frame]->y_buffer + mb_y_offset,
+ frames[frame]->u_buffer + mb_uv_offset,
+ frames[frame]->v_buffer + mb_uv_offset,
+ frames[frame]->y_stride,
mb_uv_width, mb_uv_height,
mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
@@ -429,6 +433,7 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
int frames_to_blur_backward;
int frames_to_blur_forward;
struct scale_factors sf;
+ YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = {NULL};
// Apply context specific adjustments to the arnr filter parameters.
adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
@@ -437,16 +442,15 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
start_frame = distance + frames_to_blur_forward;
// Set up frame pointers, NULL indicates frame not included in filter.
- vp9_zero(cpi->frames);
for (frame = 0; frame < frames_to_blur; ++frame) {
const int which_buffer = start_frame - frame;
struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
which_buffer);
- cpi->frames[frames_to_blur - 1 - frame] = &buf->img;
+ frames[frames_to_blur - 1 - frame] = &buf->img;
}
// Set up scaling factors. Scaling on each of the arnr frames is not supported
- if (is_spatial_svc(cpi)) {
+ if (is_two_pass_svc(cpi)) {
// In spatial svc the scaling factors might be less than 1/2. So we will use
// non-normative scaling.
int frame_used = 0;
@@ -457,19 +461,21 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
get_frame_new_buffer(cm)->y_crop_height);
for (frame = 0; frame < frames_to_blur; ++frame) {
- if (cm->mi_cols * MI_SIZE != cpi->frames[frame]->y_width ||
- cm->mi_rows * MI_SIZE != cpi->frames[frame]->y_height) {
+ if (cm->mi_cols * MI_SIZE != frames[frame]->y_width ||
+ cm->mi_rows * MI_SIZE != frames[frame]->y_height) {
if (vp9_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used],
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL,
NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to reallocate alt_ref_buffer");
- cpi->frames[frame] =
- vp9_scale_if_required(cm, cpi->frames[frame],
- &cpi->svc.scaled_frames[frame_used]);
+ frames[frame] = vp9_scale_if_required(cm, frames[frame],
+ &cpi->svc.scaled_frames[frame_used]);
++frame_used;
}
}
@@ -480,6 +486,6 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
cm->width, cm->height);
}
- temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
- strength, &sf);
+ temporal_filter_iterate_c(cpi, frames, frames_to_blur,
+ frames_to_blur_backward, strength, &sf);
}
diff --git a/source/libvpx/vp9/encoder/vp9_variance.c b/source/libvpx/vp9/encoder/vp9_variance.c
index eb5ae2e..afbb191 100644
--- a/source/libvpx/vp9/encoder/vp9_variance.c
+++ b/source/libvpx/vp9/encoder/vp9_variance.c
@@ -103,8 +103,9 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
unsigned int i, sum = 0;
- for (i = 0; i < 256; i++)
+ for (i = 0; i < 256; ++i) {
sum += src_ptr[i] * src_ptr[i];
+ }
return sum;
}
diff --git a/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
index b6bcdd9..e799951 100644
--- a/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
@@ -12,6 +12,8 @@
#include "vp9/common/vp9_idct.h" // for cospi constants
#include "vpx_ports/mem.h"
+#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
+
void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
__m128i in0, in1;
__m128i tmp;
@@ -780,58 +782,6 @@ static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) {
_mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
}
-// perform in-place transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
- res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
- res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
- res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
- res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
-}
-
void fdct8_sse2(__m128i *in) {
// constants
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
@@ -1953,23 +1903,6 @@ static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0,
write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
}
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
- __m128i tbuf[8];
- array_transpose_8x8(res0, res0);
- array_transpose_8x8(res1, tbuf);
- array_transpose_8x8(res0 + 8, res1);
- array_transpose_8x8(res1 + 8, res1 + 8);
-
- res0[8] = tbuf[0];
- res0[9] = tbuf[1];
- res0[10] = tbuf[2];
- res0[11] = tbuf[3];
- res0[12] = tbuf[4];
- res0[13] = tbuf[5];
- res0[14] = tbuf[6];
- res0[15] = tbuf[7];
-}
-
static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
// perform rounding operations
right_shift_8x8(res0, 2);
diff --git a/source/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm b/source/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm
deleted file mode 100644
index 32fdd23..0000000
--- a/source/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm
+++ /dev/null
@@ -1,427 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-global sym(vp9_sad16x16_mmx) PRIVATE
-global sym(vp9_sad8x16_mmx) PRIVATE
-global sym(vp9_sad8x8_mmx) PRIVATE
-global sym(vp9_sad4x4_mmx) PRIVATE
-global sym(vp9_sad16x8_mmx) PRIVATE
-
-;unsigned int vp9_sad16x16_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vp9_sad16x16_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
-
- lea rcx, [rcx+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x16x16sad_mmx_loop:
-
- movq mm0, QWORD PTR [rsi]
- movq mm2, QWORD PTR [rsi+8]
-
- movq mm1, QWORD PTR [rdi]
- movq mm3, QWORD PTR [rdi+8]
-
- movq mm4, mm0
- movq mm5, mm2
-
- psubusb mm0, mm1
- psubusb mm1, mm4
-
- psubusb mm2, mm3
- psubusb mm3, mm5
-
- por mm0, mm1
- por mm2, mm3
-
- movq mm1, mm0
- movq mm3, mm2
-
- punpcklbw mm0, mm6
- punpcklbw mm2, mm6
-
- punpckhbw mm1, mm6
- punpckhbw mm3, mm6
-
- paddw mm0, mm2
- paddw mm1, mm3
-
-
- lea rsi, [rsi+rax]
- add rdi, rdx
-
- paddw mm7, mm0
- paddw mm7, mm1
-
- cmp rsi, rcx
- jne .x16x16sad_mmx_loop
-
-
- movq mm0, mm7
-
- punpcklwd mm0, mm6
- punpckhwd mm7, mm6
-
- paddw mm0, mm7
- movq mm7, mm0
-
-
- psrlq mm0, 32
- paddw mm7, mm0
-
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_sad8x16_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vp9_sad8x16_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
-
- lea rcx, [rcx+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x8x16sad_mmx_loop:
-
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rdi]
-
- movq mm2, mm0
- psubusb mm0, mm1
-
- psubusb mm1, mm2
- por mm0, mm1
-
- movq mm2, mm0
- punpcklbw mm0, mm6
-
- punpckhbw mm2, mm6
- lea rsi, [rsi+rax]
-
- add rdi, rdx
- paddw mm7, mm0
-
- paddw mm7, mm2
- cmp rsi, rcx
-
- jne .x8x16sad_mmx_loop
-
- movq mm0, mm7
- punpcklwd mm0, mm6
-
- punpckhwd mm7, mm6
- paddw mm0, mm7
-
- movq mm7, mm0
- psrlq mm0, 32
-
- paddw mm7, mm0
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_sad8x8_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vp9_sad8x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x8x8sad_mmx_loop:
-
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rdi]
-
- movq mm2, mm0
- psubusb mm0, mm1
-
- psubusb mm1, mm2
- por mm0, mm1
-
- movq mm2, mm0
- punpcklbw mm0, mm6
-
- punpckhbw mm2, mm6
- paddw mm0, mm2
-
- lea rsi, [rsi+rax]
- add rdi, rdx
-
- paddw mm7, mm0
- cmp rsi, rcx
-
- jne .x8x8sad_mmx_loop
-
- movq mm0, mm7
- punpcklwd mm0, mm6
-
- punpckhwd mm7, mm6
- paddw mm0, mm7
-
- movq mm7, mm0
- psrlq mm0, 32
-
- paddw mm7, mm0
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_sad4x4_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vp9_sad4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- movd mm0, DWORD PTR [rsi]
- movd mm1, DWORD PTR [rdi]
-
- movd mm2, DWORD PTR [rsi+rax]
- movd mm3, DWORD PTR [rdi+rdx]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- movq mm2, mm0
- psubusb mm0, mm1
-
- psubusb mm1, mm2
- por mm0, mm1
-
- movq mm2, mm0
- pxor mm3, mm3
-
- punpcklbw mm0, mm3
- punpckhbw mm2, mm3
-
- paddw mm0, mm2
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movd mm4, DWORD PTR [rsi]
- movd mm5, DWORD PTR [rdi]
-
- movd mm6, DWORD PTR [rsi+rax]
- movd mm7, DWORD PTR [rdi+rdx]
-
- punpcklbw mm4, mm6
- punpcklbw mm5, mm7
-
- movq mm6, mm4
- psubusb mm4, mm5
-
- psubusb mm5, mm6
- por mm4, mm5
-
- movq mm5, mm4
- punpcklbw mm4, mm3
-
- punpckhbw mm5, mm3
- paddw mm4, mm5
-
- paddw mm0, mm4
- movq mm1, mm0
-
- punpcklwd mm0, mm3
- punpckhwd mm1, mm3
-
- paddw mm0, mm1
- movq mm1, mm0
-
- psrlq mm0, 32
- paddw mm0, mm1
-
- movq rax, mm0
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_sad16x8_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vp9_sad16x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x16x8sad_mmx_loop:
-
- movq mm0, [rsi]
- movq mm1, [rdi]
-
- movq mm2, [rsi+8]
- movq mm3, [rdi+8]
-
- movq mm4, mm0
- movq mm5, mm2
-
- psubusb mm0, mm1
- psubusb mm1, mm4
-
- psubusb mm2, mm3
- psubusb mm3, mm5
-
- por mm0, mm1
- por mm2, mm3
-
- movq mm1, mm0
- movq mm3, mm2
-
- punpcklbw mm0, mm6
- punpckhbw mm1, mm6
-
- punpcklbw mm2, mm6
- punpckhbw mm3, mm6
-
-
- paddw mm0, mm2
- paddw mm1, mm3
-
- paddw mm0, mm1
- lea rsi, [rsi+rax]
-
- add rdi, rdx
- paddw mm7, mm0
-
- cmp rsi, rcx
- jne .x16x8sad_mmx_loop
-
- movq mm0, mm7
- punpcklwd mm0, mm6
-
- punpckhwd mm7, mm6
- paddw mm0, mm7
-
- movq mm7, mm0
- psrlq mm0, 32
-
- paddw mm7, mm0
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c b/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c
index 7f81f46..ea09b95 100644
--- a/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c
+++ b/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c
@@ -12,67 +12,39 @@
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
-typedef void (*get_var_avx2) (
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
-
-void vp9_get16x16var_avx2
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
-
-void vp9_get32x32var_avx2
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
-
-unsigned int vp9_sub_pixel_variance32xh_avx2
-(
- const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- int height,
- unsigned int *sse
-);
-
-unsigned int vp9_sub_pixel_avg_variance32xh_avx2
-(
- const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- const uint8_t *sec,
- int sec_stride,
- int height,
- unsigned int *sseptr
-);
-
-static void variance_avx2(const unsigned char *src_ptr, int source_stride,
- const unsigned char *ref_ptr, int recon_stride,
- int w, int h, unsigned int *sse, int *sum,
- get_var_avx2 var_fn, int block_size) {
- unsigned int sse0;
- int sum0;
+typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum);
+
+void vp9_get16x16var_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum);
+
+void vp9_get32x32var_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum);
+
+unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height,
+ unsigned int *sse);
+
+unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ const uint8_t *sec,
+ int sec_stride,
+ int height,
+ unsigned int *sseptr);
+
+static void variance_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int w, int h, unsigned int *sse, int *sum,
+ get_var_avx2 var_fn, int block_size) {
int i, j;
*sse = 0;
@@ -80,105 +52,68 @@ static void variance_avx2(const unsigned char *src_ptr, int source_stride,
for (i = 0; i < h; i += 16) {
for (j = 0; j < w; j += block_size) {
- // processing 16 rows horizontally each call
- var_fn(src_ptr + source_stride * i + j, source_stride,
- ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
+ unsigned int sse0;
+ int sum0;
+ var_fn(&src[src_stride * i + j], src_stride,
+ &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
-unsigned int vp9_variance16x16_avx2
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
- variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
- &var, &avg, vp9_get16x16var_avx2, 16);
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 8));
+unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
+ sse, &sum, vp9_get16x16var_avx2, 16);
+ return *sse - (((unsigned int)sum * sum) >> 8);
}
-unsigned int vp9_mse16x16_avx2(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int sse0;
- int sum0;
- vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
- &sum0);
- *sse = sse0;
- return sse0;
+unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
+ return *sse;
}
-unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int recon_stride,
+unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
unsigned int *sse) {
- unsigned int var;
- int avg;
-
- // processing 32 elements vertically in parallel
- variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
- &var, &avg, vp9_get32x32var_avx2, 32);
- *sse = var;
- return (var - (((int64_t)avg * avg) >> 10));
+ int sum;
+ variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
+ sse, &sum, vp9_get32x32var_avx2, 32);
+ return *sse - (((int64_t)sum * sum) >> 9);
}
-unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int recon_stride,
+unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
unsigned int *sse) {
- unsigned int var;
- int avg;
-
- // processing 32 elements vertically in parallel
- variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
- &var, &avg, vp9_get32x32var_avx2, 32);
- *sse = var;
- return (var - (((int64_t)avg * avg) >> 9));
+ int sum;
+ variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
+ sse, &sum, vp9_get32x32var_avx2, 32);
+ return *sse - (((int64_t)sum * sum) >> 10);
}
-
-unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int recon_stride,
+unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
unsigned int *sse) {
- unsigned int var;
- int avg;
-
- // processing 32 elements vertically in parallel
- variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
- &var, &avg, vp9_get32x32var_avx2, 32);
- *sse = var;
- return (var - (((int64_t)avg * avg) >> 12));
+ int sum;
+ variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
+ sse, &sum, vp9_get32x32var_avx2, 32);
+ return *sse - (((int64_t)sum * sum) >> 12);
}
-unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int recon_stride,
+unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
unsigned int *sse) {
- unsigned int var;
- int avg;
-
- // processing 32 elements vertically in parallel
- variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
- &var, &avg, vp9_get32x32var_avx2, 32);
-
- *sse = var;
- return (var - (((int64_t)avg * avg) >> 11));
+ int sum;
+ variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
+ sse, &sum, vp9_get32x32var_avx2, 32);
+ return *sse - (((int64_t)sum * sum) >> 11);
}
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
@@ -187,22 +122,19 @@ unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
int y_offset,
const uint8_t *dst,
int dst_stride,
- unsigned int *sse_ptr) {
- // processing 32 elements in parallel
- unsigned int sse;
- int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- 64, &sse);
- // processing the next 32 elements in parallel
+ unsigned int *sse) {
+ unsigned int sse1;
+ const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ 64, &sse1);
unsigned int sse2;
- int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
- x_offset, y_offset,
- dst + 32, dst_stride,
- 64, &sse2);
- se += se2;
- sse += sse2;
- *sse_ptr = sse;
- return sse - (((int64_t)se * se) >> 12);
+ const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
+ x_offset, y_offset,
+ dst + 32, dst_stride,
+ 64, &sse2);
+ const int se = se1 + se2;
+ *sse = sse1 + sse2;
+ return *sse - (((int64_t)se * se) >> 12);
}
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
@@ -211,14 +143,11 @@ unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
int y_offset,
const uint8_t *dst,
int dst_stride,
- unsigned int *sse_ptr) {
- // processing 32 element in parallel
- unsigned int sse;
- int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- 32, &sse);
- *sse_ptr = sse;
- return sse - (((int64_t)se * se) >> 10);
+ unsigned int *sse) {
+ const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ 32, sse);
+ return *sse - (((int64_t)se * se) >> 10);
}
unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
@@ -227,24 +156,22 @@ unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
int y_offset,
const uint8_t *dst,
int dst_stride,
- unsigned int *sseptr,
+ unsigned int *sse,
const uint8_t *sec) {
- // processing 32 elements in parallel
- unsigned int sse;
-
- int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- sec, 64, 64, &sse);
+ unsigned int sse1;
+ const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ sec, 64, 64, &sse1);
unsigned int sse2;
- // processing the next 32 elements in parallel
- int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
- y_offset, dst + 32, dst_stride,
- sec + 32, 64, 64, &sse2);
- se += se2;
- sse += sse2;
- *sseptr = sse;
+ const int se2 =
+ vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
+ y_offset, dst + 32, dst_stride,
+ sec + 32, 64, 64, &sse2);
+ const int se = se1 + se2;
- return sse - (((int64_t)se * se) >> 12);
+ *sse = sse1 + sse2;
+
+ return *sse - (((int64_t)se * se) >> 12);
}
unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
@@ -253,15 +180,11 @@ unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
int y_offset,
const uint8_t *dst,
int dst_stride,
- unsigned int *sseptr,
+ unsigned int *sse,
const uint8_t *sec) {
// processing 32 elements in parallel
- unsigned int sse;
- int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- sec, 32, 32, &sse);
- *sseptr = sse;
- return sse - (((int64_t)se * se) >> 10);
+ const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ sec, 32, 32, sse);
+ return *sse - (((int64_t)se * se) >> 10);
}
-
-
diff --git a/source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm
deleted file mode 100644
index 3501cf1..0000000
--- a/source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm
+++ /dev/null
@@ -1,510 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
-global sym(vp9_get_mb_ss_mmx) PRIVATE
-sym(vp9_get_mb_ss_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 8
- ; end prolog
-
- mov rax, arg(0) ;src_ptr
- mov rcx, 16
- pxor mm4, mm4
-
-.NEXTROW:
- movq mm0, [rax]
- movq mm1, [rax+8]
- movq mm2, [rax+16]
- movq mm3, [rax+24]
- pmaddwd mm0, mm0
- pmaddwd mm1, mm1
- pmaddwd mm2, mm2
- pmaddwd mm3, mm3
-
- paddd mm4, mm0
- paddd mm4, mm1
- paddd mm4, mm2
- paddd mm4, mm3
-
- add rax, 32
- dec rcx
- ja .NEXTROW
- movq QWORD PTR [rsp], mm4
-
- ;return sum[0]+sum[1];
- movsxd rax, dword ptr [rsp]
- movsxd rcx, dword ptr [rsp+4]
- add rax, rcx
-
-
- ; begin epilog
- add rsp, 8
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_get8x8var_mmx
-;(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride,
-; unsigned int *SSE,
-; int *Sum
-;)
-global sym(vp9_get8x8var_mmx) PRIVATE
-sym(vp9_get8x8var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- push rbx
- sub rsp, 16
- ; end prolog
-
-
- pxor mm5, mm5 ; Blank mmx6
- pxor mm6, mm6 ; Blank mmx7
- pxor mm7, mm7 ; Blank mmx7
-
- mov rax, arg(0) ;[src_ptr] ; Load base addresses
- mov rbx, arg(2) ;[ref_ptr]
- movsxd rcx, dword ptr arg(1) ;[source_stride]
- movsxd rdx, dword ptr arg(3) ;[recon_stride]
-
- ; Row 1
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm1, [rbx] ; Copy eight bytes to mm1
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
-
- ; Row 2
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 3
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 4
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 5
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- ; movq mm4, [rbx + rdx]
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 6
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 7
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
-            punpcklbw       mm0, mm6                ; unpack to higher precision
-            punpcklbw       mm1, mm6
-            punpckhbw       mm2, mm6                ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 8
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
-            punpcklbw       mm0, mm6                ; unpack to higher precision
-            punpcklbw       mm1, mm6
-            punpckhbw       mm2, mm6                ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Now accumulate the final results.
- movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
- movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
- movsx rdx, WORD PTR [rsp+8]
- movsx rcx, WORD PTR [rsp+10]
- movsx rbx, WORD PTR [rsp+12]
- movsx rax, WORD PTR [rsp+14]
- add rdx, rcx
- add rbx, rax
- add rdx, rbx ;XSum
- movsxd rax, DWORD PTR [rsp]
- movsxd rcx, DWORD PTR [rsp+4]
- add rax, rcx ;XXSum
- mov rsi, arg(4) ;SSE
- mov rdi, arg(5) ;Sum
- mov dword ptr [rsi], eax
- mov dword ptr [rdi], edx
- xor rax, rax ; return 0
-
-
- ; begin epilog
- add rsp, 16
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;unsigned int
-;vp9_get4x4var_mmx
-;(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride,
-; unsigned int *SSE,
-; int *Sum
-;)
-global sym(vp9_get4x4var_mmx) PRIVATE
-sym(vp9_get4x4var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- push rbx
- sub rsp, 16
- ; end prolog
-
-
-        pxor        mm5, mm5                    ; Blank mm5
-        pxor        mm6, mm6                    ; Blank mm6
-        pxor        mm7, mm7                    ; Blank mm7
-
- mov rax, arg(0) ;[src_ptr] ; Load base addresses
- mov rbx, arg(2) ;[ref_ptr]
- movsxd rcx, dword ptr arg(1) ;[source_stride]
- movsxd rdx, dword ptr arg(3) ;[recon_stride]
-
- ; Row 1
- movd mm0, [rax] ; Copy 4 bytes to mm0
- movd mm1, [rbx] ; Copy 4 bytes to mm1
-        punpcklbw   mm0, mm6                ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- paddw mm5, mm0 ; accumulate differences in mm5
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy 4 bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
-
- ; Row 2
- movd mm0, [rax] ; Copy 4 bytes to mm0
-        punpcklbw   mm0, mm6                ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- paddw mm5, mm0 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy 4 bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 3
- movd mm0, [rax] ; Copy 4 bytes to mm0
-        punpcklbw   mm0, mm6                ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- paddw mm5, mm0 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy 4 bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 4
- movd mm0, [rax] ; Copy 4 bytes to mm0
-
-        punpcklbw   mm0, mm6                ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
-
- paddw mm5, mm0 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- paddd mm7, mm0 ; accumulate in mm7
-
-
- ; Now accumulate the final results.
- movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
- movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
- movsx rdx, WORD PTR [rsp+8]
- movsx rcx, WORD PTR [rsp+10]
- movsx rbx, WORD PTR [rsp+12]
- movsx rax, WORD PTR [rsp+14]
- add rdx, rcx
- add rbx, rax
- add rdx, rbx ;XSum
- movsxd rax, DWORD PTR [rsp]
- movsxd rcx, DWORD PTR [rsp+4]
- add rax, rcx ;XXSum
- mov rsi, arg(4) ;SSE
- mov rdi, arg(5) ;Sum
- mov dword ptr [rsi], eax
- mov dword ptr [rdi], edx
- xor rax, rax ; return 0
-
-
- ; begin epilog
- add rsp, 16
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;unsigned int
-;vp9_get4x4sse_cs_mmx
-;(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride
-;)
-global sym(vp9_get4x4sse_cs_mmx) PRIVATE
-sym(vp9_get4x4sse_cs_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- push rbx
- ; end prolog
-
-
-        pxor        mm6, mm6                    ; Blank mm6
-        pxor        mm7, mm7                    ; Blank mm7
-
- mov rax, arg(0) ;[src_ptr] ; Load base addresses
- mov rbx, arg(2) ;[ref_ptr]
- movsxd rcx, dword ptr arg(1) ;[source_stride]
- movsxd rdx, dword ptr arg(3) ;[recon_stride]
- ; Row 1
-        movd        mm0, [rax]              ; Copy four bytes to mm0
-        movd        mm1, [rbx]              ; Copy four bytes to mm1
-        punpcklbw   mm0, mm6                ; unpack to higher precision
-        punpcklbw   mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
-        movd        mm1, [rbx]              ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 2
-        movd        mm0, [rax]              ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
-        movd        mm1, [rbx]              ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 3
-        movd        mm0, [rax]              ; Copy four bytes to mm0
-        punpcklbw   mm1, mm6
-        punpcklbw   mm0, mm6                ; unpack to higher precision
- psubsw mm0, mm1 ; A-B (low order) to MM0
-
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
-        movd        mm1, [rbx]              ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 4
-        movd        mm0, [rax]              ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- pmaddwd mm0, mm0 ; square and accumulate
- paddd mm7, mm0 ; accumulate in mm7
-
- movq mm0, mm7 ;
- psrlq mm7, 32
-
- paddd mm0, mm7
- movq rax, mm0
-
-
- ; begin epilog
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm
deleted file mode 100644
index 4830412..0000000
--- a/source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm
+++ /dev/null
@@ -1,401 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_get_mb_ss_sse2
-;(
-; short *src_ptr
-;)
-global sym(vp9_get_mb_ss_sse2) PRIVATE
-sym(vp9_get_mb_ss_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 1
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
-
- mov rax, arg(0) ;[src_ptr]
- mov rcx, 8
- pxor xmm4, xmm4
-
-.NEXTROW:
- movdqa xmm0, [rax]
- movdqa xmm1, [rax+16]
- movdqa xmm2, [rax+32]
- movdqa xmm3, [rax+48]
- pmaddwd xmm0, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
-
- paddd xmm0, xmm1
- paddd xmm2, xmm3
- paddd xmm4, xmm0
- paddd xmm4, xmm2
-
- add rax, 0x40
- dec rcx
- ja .NEXTROW
-
- movdqa xmm3,xmm4
- psrldq xmm4,8
- paddd xmm4,xmm3
- movdqa xmm3,xmm4
- psrldq xmm4,4
- paddd xmm4,xmm3
- movq rax,xmm4
-
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_get16x16var_sse2
-;(
-; unsigned char * src_ptr,
-; int source_stride,
-; unsigned char * ref_ptr,
-; int recon_stride,
-; unsigned int * SSE,
-; int * Sum
-;)
-global sym(vp9_get16x16var_sse2) PRIVATE
-sym(vp9_get16x16var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;[src_ptr]
- mov rdi, arg(2) ;[ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[source_stride]
- movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
-
- ; Prefetch data
- lea rcx, [rax+rax*2]
- prefetcht0 [rsi]
- prefetcht0 [rsi+rax]
- prefetcht0 [rsi+rax*2]
- prefetcht0 [rsi+rcx]
- lea rbx, [rsi+rax*4]
- prefetcht0 [rbx]
- prefetcht0 [rbx+rax]
- prefetcht0 [rbx+rax*2]
- prefetcht0 [rbx+rcx]
-
- lea rcx, [rdx+rdx*2]
- prefetcht0 [rdi]
- prefetcht0 [rdi+rdx]
- prefetcht0 [rdi+rdx*2]
- prefetcht0 [rdi+rcx]
- lea rbx, [rdi+rdx*4]
- prefetcht0 [rbx]
- prefetcht0 [rbx+rdx]
- prefetcht0 [rbx+rdx*2]
- prefetcht0 [rbx+rcx]
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
- pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
-
- pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
- mov rcx, 16
-
-.var16loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rdi]
-
- prefetcht0 [rsi+rax*8]
- prefetcht0 [rdi+rdx*8]
-
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
-
-
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
-
- punpcklbw xmm2, xmm0
- punpckhbw xmm4, xmm0
-
-
- psubw xmm1, xmm2
- psubw xmm3, xmm4
-
- paddw xmm7, xmm1
- pmaddwd xmm1, xmm1
-
- paddw xmm7, xmm3
- pmaddwd xmm3, xmm3
-
- paddd xmm6, xmm1
- paddd xmm6, xmm3
-
- add rsi, rax
- add rdi, rdx
-
- sub rcx, 1
- jnz .var16loop
-
-
- movdqa xmm1, xmm6
- pxor xmm6, xmm6
-
- pxor xmm5, xmm5
- punpcklwd xmm6, xmm7
-
- punpckhwd xmm5, xmm7
- psrad xmm5, 16
-
- psrad xmm6, 16
- paddd xmm6, xmm5
-
- movdqa xmm2, xmm1
- punpckldq xmm1, xmm0
-
- punpckhdq xmm2, xmm0
- movdqa xmm7, xmm6
-
- paddd xmm1, xmm2
- punpckldq xmm6, xmm0
-
- punpckhdq xmm7, xmm0
- paddd xmm6, xmm7
-
- movdqa xmm2, xmm1
- movdqa xmm7, xmm6
-
- psrldq xmm1, 8
- psrldq xmm6, 8
-
- paddd xmm7, xmm6
- paddd xmm1, xmm2
-
- mov rax, arg(5) ;[Sum]
- mov rdi, arg(4) ;[SSE]
-
- movd DWORD PTR [rax], xmm7
- movd DWORD PTR [rdi], xmm1
-
-
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
-;unsigned int vp9_get8x8var_sse2
-;(
-; unsigned char * src_ptr,
-; int source_stride,
-; unsigned char * ref_ptr,
-; int recon_stride,
-; unsigned int * SSE,
-; int * Sum
-;)
-global sym(vp9_get8x8var_sse2) PRIVATE
-sym(vp9_get8x8var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- mov rsi, arg(0) ;[src_ptr]
- mov rdi, arg(2) ;[ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[source_stride]
- movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
- pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
-
- movq xmm1, QWORD PTR [rsi]
- movq xmm2, QWORD PTR [rdi]
-
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
-
- psubsw xmm1, xmm2
- paddw xmm7, xmm1
-
- pmaddwd xmm1, xmm1
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- movq xmm2, QWORD PTR[rsi + rax * 2]
- movq xmm3, QWORD PTR[rdi + rdx * 2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
- movq xmm2, QWORD PTR[rsi + rax *2]
- movq xmm3, QWORD PTR[rdi + rdx *2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
-
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
- movq xmm2, QWORD PTR[rsi + rax *2]
- movq xmm3, QWORD PTR[rdi + rdx *2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- movdqa xmm6, xmm7
- punpcklwd xmm6, xmm0
-
- punpckhwd xmm7, xmm0
- movdqa xmm2, xmm1
-
- paddw xmm6, xmm7
- punpckldq xmm1, xmm0
-
- punpckhdq xmm2, xmm0
- movdqa xmm7, xmm6
-
- paddd xmm1, xmm2
- punpckldq xmm6, xmm0
-
- punpckhdq xmm7, xmm0
- paddw xmm6, xmm7
-
- movdqa xmm2, xmm1
- movdqa xmm7, xmm6
-
- psrldq xmm1, 8
- psrldq xmm6, 8
-
- paddw xmm7, xmm6
- paddd xmm1, xmm2
-
- mov rax, arg(5) ;[Sum]
- mov rdi, arg(4) ;[SSE]
-
- movq rdx, xmm7
- movsx rcx, dx
-
- mov dword ptr [rax], ecx
- movd DWORD PTR [rdi], xmm1
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
diff --git a/source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c b/source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
deleted file mode 100644
index ce1c832..0000000
--- a/source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_ports/mem.h"
-
-unsigned int vp9_get8x8var_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse, int *sum);
-
-unsigned int vp9_get4x4var_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *SSE, int *sum);
-
-unsigned int vp9_variance4x4_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- vp9_get4x4var_mmx(src, src_stride, ref, ref_stride, sse, &sum);
- return *sse - (((unsigned int)sum * sum) >> 4);
-}
-
-unsigned int vp9_variance8x8_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, sse, &sum);
- return *sse - (((unsigned int)sum * sum) >> 6);
-}
-
-unsigned int vp9_mse16x16_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, sse2, sse3;
- int sum0, sum1, sum2, sum3;
-
- vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
- vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
- ref + 8 * ref_stride, ref_stride, &sse2, &sum2);
- vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride,
- ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);
-
- *sse = sse0 + sse1 + sse2 + sse3;
- return *sse;
-}
-
-
-unsigned int vp9_variance16x16_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, sse2, sse3;
- int sum0, sum1, sum2, sum3, sum;
-
- vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
- vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
- ref + 8 * ref_stride, ref_stride, &sse2, &sum2);
- vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride,
- ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);
-
- *sse = sse0 + sse1 + sse2 + sse3;
- sum = sum0 + sum1 + sum2 + sum3;
- return *sse - (((unsigned int)sum * sum) >> 8);
-}
-
-unsigned int vp9_variance16x8_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1;
- int sum0, sum1, sum;
-
- vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
-
- *sse = sse0 + sse1;
- sum = sum0 + sum1;
- return *sse - (((unsigned int)sum * sum) >> 7);
-}
-
-
-unsigned int vp9_variance8x16_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1;
- int sum0, sum1, sum;
-
- vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
- ref + 8 * ref_stride, ref_stride, &sse1, &sum1);
-
- *sse = sse0 + sse1;
- sum = sum0 + sum1;
- return *sse - (((unsigned int)sum * sum) >> 7);
-}
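
All of the deleted wrappers above, and their SSE2 replacements below, derive variance from the same pair of accumulators. A minimal sketch of the identity they share (hypothetical helper, not part of the patch):

    #include <stdint.h>

    /* variance = SSE - sum^2 / N for an N-pixel block, N a power of two;
     * this is the `*sse - ((sum * sum) >> log2(N))` pattern used by every
     * vp9_variance* wrapper in this file. */
    static unsigned int variance_from_sums(unsigned int sse, int sum,
                                           int log2_count) {
      return sse - (unsigned int)(((int64_t)sum * sum) >> log2_count);
    }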
diff --git a/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
index e935a23..b4d2b0a 100644
--- a/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <emmintrin.h> // SSE2
+
#include "./vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
@@ -17,18 +19,137 @@ typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse, int *sum);
-unsigned int vp9_get4x4var_mmx(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse, int *sum);
+unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
+ __m128i vsum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 32; ++i) {
+ const __m128i v = _mm_loadu_si128((const __m128i *)src);
+ vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+ src += 8;
+ }
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+ return _mm_cvtsi128_si32(vsum);
+}
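
For reference, a scalar sketch of the computation above (it assumes, as the intrinsics do, 256 contiguous int16 coefficients, i.e. a 16x16 macroblock):

    #include <stdint.h>

    /* Scalar reference (sketch): sum of squares of 256 int16 values,
     * accumulated in 32 bits exactly as the SSE2 loop does. */
    static unsigned int get_mb_ss_ref(const int16_t *src) {
      unsigned int ss = 0;
      int i;
      for (i = 0; i < 256; ++i)
        ss += (unsigned int)(src[i] * src[i]);
      return ss;
    }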
-unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse, int *sum);
+#define READ64(p, stride, i) \
+ _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
+ _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
+
+unsigned int vp9_get4x4var_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
+ const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
+ const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
+ const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+ const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+ // sum
+ __m128i vsum = _mm_add_epi16(diff0, diff1);
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+ // sse
+ vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
+ _mm_madd_epi16(diff1, diff1));
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+ *sse = _mm_cvtsi128_si32(vsum);
+
+ return 0;
+}
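
Note that READ64 interleaves the bytes of rows i and i+1 rather than concatenating them; this is harmless because the sum and SSE accumulators are permutation-invariant as long as src and ref are interleaved identically. A scalar sketch of the accumulators (hypothetical reference, any w x h):

    #include <stdint.h>

    /* Scalar reference (sketch) for the {sse, sum} pair the SIMD kernels
     * produce; the wrappers below turn these into variance or MSE. */
    static void get_var_ref(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            int w, int h, unsigned int *sse, int *sum) {
      int r, c;
      *sse = 0;
      *sum = 0;
      for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
          const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
          *sum += d;
          *sse += (unsigned int)(d * d);
        }
      }
    }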
+
+unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ __m128i vsse = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 8; i += 2) {
+ const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(src + i * src_stride)), zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(ref + i * ref_stride)), zero);
+ const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+ const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(src + (i + 1) * src_stride)), zero);
+ const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
+ const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+ vsum = _mm_add_epi16(vsum, diff0);
+ vsum = _mm_add_epi16(vsum, diff1);
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+ }
+
+ // sum
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+ // sse
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+ *sse = _mm_cvtsi128_si32(vsse);
+
+ return 0;
+}
+
+unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ __m128i vsse = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ const __m128i s = _mm_loadu_si128((const __m128i *)src);
+ const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+
+ const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+ const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+ const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+ const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+ const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+ vsum = _mm_add_epi16(vsum, diff0);
+ vsum = _mm_add_epi16(vsum, diff1);
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ // sum
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
+ (int16_t)_mm_extract_epi16(vsum, 1);
+
+ // sse
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+ *sse = _mm_cvtsi128_si32(vsse);
+
+ return 0;
+}
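
The 16x16 reduction stops one epi16 shuffle short of the 8x8 version and adds the last two partial sums in scalar; a worked bound shows why:

    /* With every diff at +/-255: after the two 16-bit reductions each of
     * the two remaining lanes holds at most 128 * 255 = 32640, which still
     * fits int16, but the full 16x16 sum is 256 * 255 = 65280, which does
     * not -- hence the two separate extracts added as ints. For 8x8 the
     * total is only 64 * 255 = 16320, so its extra epi16 step is safe. */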
-unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse, int *sum);
static void variance_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
@@ -55,8 +176,7 @@ unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 4, 4,
- sse, &sum, vp9_get4x4var_mmx, 4);
+ vp9_get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 4);
}
@@ -65,7 +185,7 @@ unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
- sse, &sum, vp9_get4x4var_mmx, 4);
+ sse, &sum, vp9_get4x4var_sse2, 4);
return *sse - (((unsigned int)sum * sum) >> 5);
}
@@ -74,7 +194,7 @@ unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
- sse, &sum, vp9_get4x4var_mmx, 4);
+ sse, &sum, vp9_get4x4var_sse2, 4);
return *sse - (((unsigned int)sum * sum) >> 5);
}
@@ -82,8 +202,7 @@ unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
- sse, &sum, vp9_get8x8var_sse2, 8);
+ vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 6);
}
@@ -109,17 +228,8 @@ unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
- sse, &sum, vp9_get16x16var_sse2, 16);
- return *sse - (((unsigned int)sum * sum) >> 8);
-}
-
-unsigned int vp9_mse16x16_sse2(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
- return *sse;
+ return *sse - (((unsigned int)sum * sum) >> 8);
}
unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
@@ -176,6 +286,34 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride,
return *sse - (((int64_t)sum * sum) >> 11);
}
+unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
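
The vp9_mse*_sse2 wrappers above reuse the variance kernels but keep only the raw error term; the relation, as a sketch:

    /* variance = SSE - sum^2 / N   (mean-corrected)
     * mse      = SSE               (no mean correction)
     * Each wrapper therefore runs the matching variance function, ignores
     * the corrected result, and reports the *sse accumulator directly. */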
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
diff --git a/source/libvpx/vp9/vp9_common.mk b/source/libvpx/vp9/vp9_common.mk
index 8e3e885..90f0342 100644
--- a/source/libvpx/vp9/vp9_common.mk
+++ b/source/libvpx/vp9/vp9_common.mk
@@ -80,7 +80,6 @@ VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
endif
diff --git a/source/libvpx/vp9/vp9_cx_iface.c b/source/libvpx/vp9/vp9_cx_iface.c
index 1716053..0f0b7a5 100644
--- a/source/libvpx/vp9/vp9_cx_iface.c
+++ b/source/libvpx/vp9/vp9_cx_iface.c
@@ -21,7 +21,6 @@
#include "vp9/vp9_iface_common.h"
struct vp9_extracfg {
- struct vpx_codec_pkt_list *pkt_list;
int cpu_used; // available cpu percentage in 1/16
unsigned int enable_auto_alt_ref;
unsigned int noise_sensitivity;
@@ -31,7 +30,6 @@ struct vp9_extracfg {
unsigned int tile_rows;
unsigned int arnr_max_frames;
unsigned int arnr_strength;
- unsigned int arnr_type;
vp8e_tuning tuning;
unsigned int cq_level; // constrained quality level
unsigned int rc_max_intra_bitrate_pct;
@@ -39,41 +37,29 @@ struct vp9_extracfg {
unsigned int frame_parallel_decoding_mode;
AQ_MODE aq_mode;
unsigned int frame_periodic_boost;
- BIT_DEPTH bit_depth;
+ vpx_bit_depth_t bit_depth;
vp9e_tune_content content;
};
-struct extraconfig_map {
- unsigned int usage;
- struct vp9_extracfg cfg;
-};
-
-static const struct extraconfig_map extracfg_map[] = {
- {
- 0,
- { // NOLINT
- NULL,
- 0, // cpu_used
- 1, // enable_auto_alt_ref
- 0, // noise_sensitivity
- 0, // sharpness
- 0, // static_thresh
- 0, // tile_columns
- 0, // tile_rows
- 7, // arnr_max_frames
- 5, // arnr_strength
- 3, // arnr_type
- VP8_TUNE_PSNR, // tuning
- 10, // cq_level
- 0, // rc_max_intra_bitrate_pct
- 0, // lossless
- 0, // frame_parallel_decoding_mode
- NO_AQ, // aq_mode
- 0, // frame_periodic_delta_q
- BITS_8, // Bit depth
- VP9E_CONTENT_DEFAULT // content
- }
- }
+static struct vp9_extracfg default_extra_cfg = {
+ 0, // cpu_used
+ 1, // enable_auto_alt_ref
+ 0, // noise_sensitivity
+ 0, // sharpness
+ 0, // static_thresh
+ 0, // tile_columns
+ 0, // tile_rows
+ 7, // arnr_max_frames
+ 5, // arnr_strength
+ VP8_TUNE_PSNR, // tuning
+ 10, // cq_level
+ 0, // rc_max_intra_bitrate_pct
+ 0, // lossless
+ 0, // frame_parallel_decoding_mode
+ NO_AQ, // aq_mode
+ 0, // frame_periodic_delta_q
+ VPX_BITS_8, // Bit depth
+ VP9E_CONTENT_DEFAULT // content
};
struct vpx_codec_alg_priv {
@@ -177,20 +163,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
}
RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS);
-
-#if CONFIG_SPATIAL_SVC
- if (cfg->ss_number_layers > 1) {
- unsigned int i, alt_ref_sum = 0;
- for (i = 0; i < cfg->ss_number_layers; ++i) {
- if (cfg->ss_enable_auto_alt_ref[i])
- ++alt_ref_sum;
- }
- if (alt_ref_sum > REF_FRAMES - cfg->ss_number_layers)
- ERROR("Not enough ref buffers for svc alt ref frames");
- }
-#endif
-
RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS);
+
if (cfg->ts_number_layers > 1) {
unsigned int i;
for (i = 1; i < cfg->ts_number_layers; ++i)
@@ -203,6 +177,28 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
ERROR("ts_rate_decimator factors are not powers of 2");
}
+#if CONFIG_SPATIAL_SVC
+ if (cfg->ss_number_layers * cfg->ts_number_layers > REF_FRAMES)
+ ERROR("Too many layers. Maximum 8 layers could be set");
+
+ if ((cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) &&
+ cfg->g_pass == VPX_RC_LAST_PASS) {
+ unsigned int i, alt_ref_sum = 0;
+ for (i = 0; i < cfg->ss_number_layers; ++i) {
+ if (cfg->ss_enable_auto_alt_ref[i])
+ ++alt_ref_sum;
+ }
+ if (alt_ref_sum >
+ REF_FRAMES - cfg->ss_number_layers * cfg->ts_number_layers)
+ ERROR("Not enough ref buffers for svc alt ref frames");
+ if ((cfg->ss_number_layers > 3 ||
+ cfg->ss_number_layers * cfg->ts_number_layers > 4) &&
+ cfg->g_error_resilient == 0)
+ ERROR("Multiple frame context are not supported for more than 3 spatial "
+ "layers or more than 4 spatial x temporal layers");
+ }
+#endif
+
// VP9 does not support a lower bound on the keyframe interval in
// automatic keyframe placement mode.
if (cfg->kf_mode != VPX_KF_DISABLED &&
@@ -219,8 +215,9 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(extra_cfg, sharpness, 7);
RANGE_CHECK(extra_cfg, arnr_max_frames, 0, 15);
RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
- RANGE_CHECK(extra_cfg, arnr_type, 1, 3);
RANGE_CHECK(extra_cfg, cq_level, 0, 63);
+ RANGE_CHECK(cfg, g_bit_depth, VPX_BITS_8, VPX_BITS_12);
+ RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
RANGE_CHECK(extra_cfg, content,
VP9E_CONTENT_DEFAULT, VP9E_CONTENT_INVALID - 1);
@@ -239,7 +236,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
if (cfg->rc_twopass_stats_in.sz % packet_sz)
ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
- if (cfg->ss_number_layers > 1) {
+ if (cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) {
int i;
unsigned int n_packets_per_layer[VPX_SS_MAX_LAYERS] = {0};
@@ -279,12 +276,16 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
}
}
+#if !CONFIG_VP9_HIGHBITDEPTH
+ if (cfg->g_profile > (unsigned int)PROFILE_1)
+ ERROR("Profile > 1 not supported in this build configuration");
+#endif
if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
- extra_cfg->bit_depth > BITS_8)
- ERROR("High bit-depth not supported in profile < 2");
+ extra_cfg->bit_depth > VPX_BITS_8)
+ ERROR("Codec high bit-depth not supported in profile < 2");
if (cfg->g_profile > (unsigned int)PROFILE_1 &&
- extra_cfg->bit_depth == BITS_8)
- ERROR("Bit-depth 8 not supported in profile > 1");
+ extra_cfg->bit_depth == VPX_BITS_8)
+ ERROR("Codec bit-depth 8 not supported in profile > 1");
return VPX_CODEC_OK;
}
@@ -316,6 +317,9 @@ static int get_image_bps(const vpx_image_t *img) {
case VPX_IMG_FMT_I420: return 12;
case VPX_IMG_FMT_I422: return 16;
case VPX_IMG_FMT_I444: return 24;
+ case VPX_IMG_FMT_I42016: return 24;
+ case VPX_IMG_FMT_I42216: return 32;
+ case VPX_IMG_FMT_I44416: return 48;
default: assert(0 && "Invalid image format"); break;
}
return 0;
@@ -330,12 +334,13 @@ static vpx_codec_err_t set_encoder_config(
oxcf->width = cfg->g_w;
oxcf->height = cfg->g_h;
oxcf->bit_depth = extra_cfg->bit_depth;
+ oxcf->input_bit_depth = cfg->g_input_bit_depth;
// guess a frame rate if out of whack, use 30
oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
if (oxcf->init_framerate > 180)
oxcf->init_framerate = 30;
- oxcf->mode = BEST;
+ oxcf->mode = GOOD;
switch (cfg->g_pass) {
case VPX_RC_ONE_PASS:
@@ -393,7 +398,6 @@ static vpx_codec_err_t set_encoder_config(
oxcf->sharpness = extra_cfg->sharpness;
oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in;
- oxcf->output_pkt_list = extra_cfg->pkt_list;
#if CONFIG_FP_MB_STATS
oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in;
@@ -401,7 +405,6 @@ static vpx_codec_err_t set_encoder_config(
oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
oxcf->arnr_strength = extra_cfg->arnr_strength;
- oxcf->arnr_type = extra_cfg->arnr_type;
oxcf->tuning = extra_cfg->tuning;
oxcf->content = extra_cfg->content;
@@ -428,6 +431,9 @@ static vpx_codec_err_t set_encoder_config(
}
} else if (oxcf->ss_number_layers == 1) {
oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth;
+#if CONFIG_SPATIAL_SVC
+ oxcf->ss_play_alternate[0] = extra_cfg->enable_auto_alt_ref;
+#endif
}
oxcf->ts_number_layers = cfg->ts_number_layers;
@@ -597,9 +603,9 @@ static vpx_codec_err_t ctrl_set_arnr_strength(vpx_codec_alg_priv_t *ctx,
static vpx_codec_err_t ctrl_set_arnr_type(vpx_codec_alg_priv_t *ctx,
va_list args) {
- struct vp9_extracfg extra_cfg = ctx->extra_cfg;
- extra_cfg.arnr_type = CAST(VP8E_SET_ARNR_TYPE, args);
- return update_extra_cfg(ctx, &extra_cfg);
+ (void)ctx;
+ (void)args;
+ return VPX_CODEC_OK;
}
static vpx_codec_err_t ctrl_set_tuning(vpx_codec_alg_priv_t *ctx,
@@ -659,51 +665,32 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
(void)data;
if (ctx->priv == NULL) {
- int i;
- vpx_codec_enc_cfg_t *cfg;
- struct vpx_codec_alg_priv *priv = calloc(1, sizeof(*priv));
-
+ vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv));
if (priv == NULL)
return VPX_CODEC_MEM_ERROR;
- ctx->priv = &priv->base;
- ctx->priv->sz = sizeof(*ctx->priv);
- ctx->priv->alg_priv = priv;
+ ctx->priv = (vpx_codec_priv_t *)priv;
ctx->priv->init_flags = ctx->init_flags;
ctx->priv->enc.total_encoders = 1;
if (ctx->config.enc) {
// Update the reference to the config structure to an internal copy.
- ctx->priv->alg_priv->cfg = *ctx->config.enc;
- ctx->config.enc = &ctx->priv->alg_priv->cfg;
+ priv->cfg = *ctx->config.enc;
+ ctx->config.enc = &priv->cfg;
}
- cfg = &ctx->priv->alg_priv->cfg;
-
-    // Select the extra vp9 configuration table based on the current
- // usage value. If the current usage value isn't found, use the
- // values for usage case 0.
- for (i = 0;
- extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
- ++i) {}
-
- priv->extra_cfg = extracfg_map[i].cfg;
- priv->extra_cfg.pkt_list = &priv->pkt_list.head;
-
+ priv->extra_cfg = default_extra_cfg;
vp9_initialize_enc();
res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
if (res == VPX_CODEC_OK) {
- VP9_COMP *cpi;
- set_encoder_config(&ctx->priv->alg_priv->oxcf,
- &ctx->priv->alg_priv->cfg,
- &ctx->priv->alg_priv->extra_cfg);
- cpi = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
- if (cpi == NULL)
+ set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+ priv->cpi = vp9_create_compressor(&priv->oxcf);
+ if (priv->cpi == NULL)
res = VPX_CODEC_MEM_ERROR;
else
- ctx->priv->alg_priv->cpi = cpi;
+ priv->cpi->output_pkt_list = &priv->pkt_list.head;
}
}
@@ -713,7 +700,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
free(ctx->cx_data);
vp9_remove_compressor(ctx->cpi);
- free(ctx);
+ vpx_free(ctx);
return VPX_CODEC_OK;
}
@@ -825,6 +812,23 @@ static int64_t ticks_to_timebase_units(const vpx_rational_t *timebase,
return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
}
+static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi,
+ unsigned int lib_flags) {
+ vpx_codec_frame_flags_t flags = lib_flags << 16;
+
+ if (lib_flags & FRAMEFLAGS_KEY
+#if CONFIG_SPATIAL_SVC
+ || (is_two_pass_svc(cpi) && cpi->svc.layer_context[0].is_key_frame)
+#endif
+ )
+ flags |= VPX_FRAME_IS_KEY;
+
+ if (cpi->droppable)
+ flags |= VPX_FRAME_IS_DROPPABLE;
+
+ return flags;
+}
+
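
The new get_frame_pkt_flags helper carries the encoder's raw lib_flags in the upper 16 bits of the packet flags and sets the public bits callers test; a sketch:

    /* Packed layout (sketch): bits 16+ hold lib_flags verbatim, while
     * VPX_FRAME_IS_KEY / VPX_FRAME_IS_DROPPABLE sit in the low bits, so a
     * caller can test e.g.
     *   if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) { ... }           */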
static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
const vpx_image_t *img,
vpx_codec_pts_t pts,
@@ -832,18 +836,19 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
vpx_enc_frame_flags_t flags,
unsigned long deadline) {
vpx_codec_err_t res = VPX_CODEC_OK;
+ VP9_COMP *const cpi = ctx->cpi;
const vpx_rational_t *const timebase = &ctx->cfg.g_timebase;
if (img != NULL) {
res = validate_img(ctx, img);
// TODO(jzern) the checks related to cpi's validity should be treated as a
// failure condition, encoder setup is done fully in init() currently.
- if (res == VPX_CODEC_OK && ctx->cpi != NULL && ctx->cx_data == NULL) {
+ if (res == VPX_CODEC_OK && cpi != NULL && ctx->cx_data == NULL) {
// There's no codec control for multiple alt-refs so check the encoder
// instance for its status to determine the compressed data size.
ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h *
get_image_bps(img) / 8 *
- (ctx->cpi->multi_arf_allowed ? 8 : 2);
+ (cpi->multi_arf_allowed ? 8 : 2);
if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096;
ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
@@ -863,7 +868,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_INVALID_PARAM;
}
- vp9_apply_encoding_flags(ctx->cpi, flags);
+ vp9_apply_encoding_flags(cpi, flags);
// Handle fixed keyframe intervals
if (ctx->cfg.kf_mode == VPX_KF_AUTO &&
@@ -875,7 +880,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
}
// Initialize the encoder instance on the first frame.
- if (res == VPX_CODEC_OK && ctx->cpi != NULL) {
+ if (res == VPX_CODEC_OK && cpi != NULL) {
unsigned int lib_flags = 0;
YV12_BUFFER_CONFIG sd;
int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts);
@@ -886,16 +891,15 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
// Set up internal flags
if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
- ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1;
+ cpi->b_calculate_psnr = 1;
if (img != NULL) {
res = image2yuvconfig(img, &sd);
// Store the original flags in to the frame buffer. Will extract the
// key frame flag when we actually encode this frame.
- if (vp9_receive_raw_frame(ctx->cpi, flags,
+ if (vp9_receive_raw_frame(cpi, flags,
&sd, dst_time_stamp, dst_end_time_stamp)) {
- VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
res = update_error_state(ctx, &cpi->common.error);
}
}
@@ -920,22 +924,21 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
}
while (cx_data_sz >= ctx->cx_data_sz / 2 &&
- -1 != vp9_get_compressed_data(ctx->cpi, &lib_flags, &size,
+ -1 != vp9_get_compressed_data(cpi, &lib_flags, &size,
cx_data, &dst_time_stamp,
&dst_end_time_stamp, !img)) {
if (size) {
- VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
vpx_codec_cx_pkt_t pkt;
#if CONFIG_SPATIAL_SVC
- if (is_spatial_svc(cpi))
+ if (is_two_pass_svc(cpi))
cpi->svc.layer_context[cpi->svc.spatial_layer_id].layer_size += size;
#endif
// Pack invisible frames with the next visible frame
- if (cpi->common.show_frame == 0
+ if (!cpi->common.show_frame
#if CONFIG_SPATIAL_SVC
- || (is_spatial_svc(cpi) &&
+ || (is_two_pass_svc(cpi) &&
cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
#endif
) {
@@ -955,30 +958,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
pkt.data.frame.duration =
(unsigned long)ticks_to_timebase_units(timebase,
dst_end_time_stamp - dst_time_stamp);
- pkt.data.frame.flags = lib_flags << 16;
-
- if (lib_flags & FRAMEFLAGS_KEY
-#if CONFIG_SPATIAL_SVC
- || (is_spatial_svc(cpi) &&
- cpi->svc.layer_context[0].is_key_frame)
-#endif
- )
- pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
-
- if (cpi->common.show_frame == 0) {
- pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
-
- // This timestamp should be as close as possible to the
- // prior PTS so that if a decoder uses pts to schedule when
- // to do this, we start right after last frame was decoded.
- // Invisible frames have no duration.
- pkt.data.frame.pts =
- ticks_to_timebase_units(timebase, cpi->last_time_stamp_seen) + 1;
- pkt.data.frame.duration = 0;
- }
-
- if (cpi->droppable)
- pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
+ pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
if (ctx->pending_cx_data) {
ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
@@ -1000,9 +980,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
cx_data += size;
cx_data_sz -= size;
#if CONFIG_SPATIAL_SVC
- if (is_spatial_svc(cpi)) {
- vpx_codec_cx_pkt_t pkt = {0};
+ if (is_two_pass_svc(cpi)) {
+ vpx_codec_cx_pkt_t pkt;
int i;
+ vp9_zero(pkt);
pkt.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
pkt.data.layer_sizes[i] = cpi->svc.layer_context[i].layer_size;
@@ -1289,6 +1270,9 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
320, // g_width
240, // g_height
+ VPX_BITS_8, // g_bit_depth
+ 8, // g_input_bit_depth
+
{1, 30}, // g_timebase
0, // g_error_resilient
@@ -1354,11 +1338,11 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = {
encoder_destroy, // vpx_codec_destroy_fn_t
encoder_ctrl_maps, // vpx_codec_ctrl_fn_map_t
{ // NOLINT
- NOT_IMPLEMENTED, // vpx_codec_peek_si_fn_t
- NOT_IMPLEMENTED, // vpx_codec_get_si_fn_t
- NOT_IMPLEMENTED, // vpx_codec_decode_fn_t
- NOT_IMPLEMENTED, // vpx_codec_frame_get_fn_t
- NOT_IMPLEMENTED // vpx_codec_set_fb_fn_t
+ NULL, // vpx_codec_peek_si_fn_t
+ NULL, // vpx_codec_get_si_fn_t
+ NULL, // vpx_codec_decode_fn_t
+ NULL, // vpx_codec_frame_get_fn_t
+ NULL // vpx_codec_set_fb_fn_t
},
{ // NOLINT
1, // 1 cfg map
@@ -1366,8 +1350,8 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = {
encoder_encode, // vpx_codec_encode_fn_t
encoder_get_cxdata, // vpx_codec_get_cx_data_fn_t
encoder_set_config, // vpx_codec_enc_config_set_fn_t
- NOT_IMPLEMENTED, // vpx_codec_get_global_headers_fn_t
+ NULL, // vpx_codec_get_global_headers_fn_t
encoder_get_preview, // vpx_codec_get_preview_frame_fn_t
- NOT_IMPLEMENTED // vpx_codec_enc_mr_get_mem_loc_fn_t
+ NULL // vpx_codec_enc_mr_get_mem_loc_fn_t
}
};
diff --git a/source/libvpx/vp9/vp9_dx_iface.c b/source/libvpx/vp9/vp9_dx_iface.c
index bb2bb10..393c66e 100644
--- a/source/libvpx/vp9/vp9_dx_iface.c
+++ b/source/libvpx/vp9/vp9_dx_iface.c
@@ -58,28 +58,22 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
(void)data;
if (!ctx->priv) {
- vpx_codec_alg_priv_t *alg_priv = vpx_memalign(32, sizeof(*alg_priv));
- if (alg_priv == NULL)
+ vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv));
+ if (priv == NULL)
return VPX_CODEC_MEM_ERROR;
- vp9_zero(*alg_priv);
-
- ctx->priv = (vpx_codec_priv_t *)alg_priv;
- ctx->priv->sz = sizeof(*ctx->priv);
- ctx->priv->alg_priv = alg_priv;
- ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
+ ctx->priv = (vpx_codec_priv_t *)priv;
ctx->priv->init_flags = ctx->init_flags;
- ctx->priv->alg_priv->flushed = 0;
- ctx->priv->alg_priv->frame_parallel_decode =
- (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING);
- // Disable frame parallel decoding for now.
- ctx->priv->alg_priv->frame_parallel_decode = 0;
+ priv->si.sz = sizeof(priv->si);
+ priv->flushed = 0;
+ priv->frame_parallel_decode =
+ (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING);
+ priv->frame_parallel_decode = 0; // Disable for now
if (ctx->config.dec) {
- // Update the reference to the config structure to an internal copy.
- ctx->priv->alg_priv->cfg = *ctx->config.dec;
- ctx->config.dec = &ctx->priv->alg_priv->cfg;
+ priv->cfg = *ctx->config.dec;
+ ctx->config.dec = &priv->cfg;
}
}
@@ -443,6 +437,7 @@ static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
// call to get_frame.
if (!(*iter)) {
img = &ctx->img;
+ img->bit_depth = (int)ctx->pbi->common.bit_depth;
*iter = img;
}
}
@@ -591,6 +586,23 @@ static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx,
}
}
+static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ unsigned int *const bit_depth = va_arg(args, unsigned int *);
+
+ if (bit_depth) {
+ if (ctx->pbi) {
+ const VP9_COMMON *const cm = &ctx->pbi->common;
+ *bit_depth = cm->bit_depth;
+ return VPX_CODEC_OK;
+ } else {
+ return VPX_CODEC_ERROR;
+ }
+ } else {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+}
+
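
The new control is reached through vpx_codec_control(); a usage sketch, assuming an initialized decoder context named `decoder`:

    unsigned int bit_depth = 0;
    if (vpx_codec_control(&decoder, VP9D_GET_BIT_DEPTH, &bit_depth) ==
        VPX_CODEC_OK) {
      /* 8 for profiles 0/1; 10 or 12 for high bit-depth streams. */
    }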
static vpx_codec_err_t ctrl_set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
va_list args) {
ctx->invert_tile_order = va_arg(args, int);
@@ -623,6 +635,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{VP8D_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted},
{VP9_GET_REFERENCE, ctrl_get_reference},
{VP9D_GET_DISPLAY_SIZE, ctrl_get_display_size},
+ {VP9D_GET_BIT_DEPTH, ctrl_get_bit_depth},
{ -1, NULL},
};
@@ -647,12 +660,12 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = {
},
{ // NOLINT
0,
- NOT_IMPLEMENTED, // vpx_codec_enc_cfg_map_t
- NOT_IMPLEMENTED, // vpx_codec_encode_fn_t
- NOT_IMPLEMENTED, // vpx_codec_get_cx_data_fn_t
- NOT_IMPLEMENTED, // vpx_codec_enc_config_set_fn_t
- NOT_IMPLEMENTED, // vpx_codec_get_global_headers_fn_t
- NOT_IMPLEMENTED, // vpx_codec_get_preview_frame_fn_t
- NOT_IMPLEMENTED // vpx_codec_enc_mr_get_mem_loc_fn_t
+ NULL, // vpx_codec_enc_cfg_map_t
+ NULL, // vpx_codec_encode_fn_t
+ NULL, // vpx_codec_get_cx_data_fn_t
+ NULL, // vpx_codec_enc_config_set_fn_t
+ NULL, // vpx_codec_get_global_headers_fn_t
+ NULL, // vpx_codec_get_preview_frame_fn_t
+ NULL // vpx_codec_enc_mr_get_mem_loc_fn_t
}
};
diff --git a/source/libvpx/vp9/vp9cx.mk b/source/libvpx/vp9/vp9cx.mk
index dc46c4e..e450f7b 100644
--- a/source/libvpx/vp9/vp9cx.mk
+++ b/source/libvpx/vp9/vp9cx.mk
@@ -93,10 +93,6 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
diff --git a/source/libvpx/vpx/internal/vpx_codec_internal.h b/source/libvpx/vpx/internal/vpx_codec_internal.h
index 95119df..cbfffd0 100644
--- a/source/libvpx/vpx/internal/vpx_codec_internal.h
+++ b/source/libvpx/vpx/internal/vpx_codec_internal.h
@@ -286,8 +286,6 @@ typedef const struct vpx_codec_enc_cfg_map {
vpx_codec_enc_cfg_t cfg;
} vpx_codec_enc_cfg_map_t;
-#define NOT_IMPLEMENTED 0
-
/*!\brief Decoder algorithm interface
*
* All decoders \ref MUST expose a variable of this type.
@@ -337,8 +335,6 @@ typedef struct vpx_codec_priv_cb_pair {
* and the pointer cast to the proper type.
*/
struct vpx_codec_priv {
- unsigned int sz;
- struct vpx_codec_alg_priv *alg_priv;
const char *err_detail;
vpx_codec_flags_t init_flags;
struct {
@@ -346,7 +342,7 @@ struct vpx_codec_priv {
vpx_codec_priv_cb_pair_t put_slice_cb;
} dec;
struct {
- struct vpx_fixed_buf cx_data_dst_buf;
+ vpx_fixed_buf_t cx_data_dst_buf;
unsigned int cx_data_pad_before;
unsigned int cx_data_pad_after;
vpx_codec_cx_pkt_t cx_data_pkt;
diff --git a/source/libvpx/vpx/src/svc_encodeframe.c b/source/libvpx/vpx/src/svc_encodeframe.c
index 7828615..8911e83 100644
--- a/source/libvpx/vpx/src/svc_encodeframe.c
+++ b/source/libvpx/vpx/src/svc_encodeframe.c
@@ -86,6 +86,7 @@ typedef struct SvcInternal {
int layers;
int layer;
int is_keyframe;
+ int use_multiple_frame_contexts;
FrameData *frame_list;
FrameData *frame_temp;
@@ -366,6 +367,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
char *option_name;
char *option_value;
char *input_ptr;
+ SvcInternal *const si = get_svc_internal(svc_ctx);
vpx_codec_err_t res = VPX_CODEC_OK;
if (options == NULL) return VPX_CODEC_OK;
@@ -382,8 +384,10 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
res = VPX_CODEC_INVALID_PARAM;
break;
}
- if (strcmp("layers", option_name) == 0) {
+ if (strcmp("spatial-layers", option_name) == 0) {
svc_ctx->spatial_layers = atoi(option_value);
+ } else if (strcmp("temporal-layers", option_name) == 0) {
+ svc_ctx->temporal_layers = atoi(option_value);
} else if (strcmp("scale-factors", option_name) == 0) {
res = parse_scale_factors(svc_ctx, option_value);
if (res != VPX_CODEC_OK) break;
@@ -393,6 +397,8 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
} else if (strcmp("auto-alt-refs", option_name) == 0) {
res = parse_auto_alt_ref(svc_ctx, option_value);
if (res != VPX_CODEC_OK) break;
+ } else if (strcmp("multi-frame-contexts", option_name) == 0) {
+ si->use_multiple_frame_contexts = atoi(option_value);
} else {
svc_log(svc_ctx, SVC_LOG_ERROR, "invalid option: %s\n", option_name);
res = VPX_CODEC_INVALID_PARAM;
@@ -401,6 +407,12 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
option_name = strtok_r(NULL, "=", &input_ptr);
}
free(input_string);
+
+ if (si->use_multiple_frame_contexts &&
+ (svc_ctx->spatial_layers > 3 ||
+ svc_ctx->spatial_layers * svc_ctx->temporal_layers > 4))
+ res = VPX_CODEC_INVALID_PARAM;
+
return res;
}
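
For reference, an options string exercising the new keys parsed above (values illustrative; options are supplied via vpx_svc_set_options()):

    /*   "spatial-layers=2 temporal-layers=2 multi-frame-contexts=1"
     * The parser returns VPX_CODEC_INVALID_PARAM when multi-frame-contexts
     * is requested with spatial_layers > 3 or
     * spatial_layers * temporal_layers > 4. */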
@@ -480,6 +492,16 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
res = parse_options(svc_ctx, si->options);
if (res != VPX_CODEC_OK) return res;
+ if (svc_ctx->spatial_layers < 1)
+ svc_ctx->spatial_layers = 1;
+ if (svc_ctx->spatial_layers > VPX_SS_MAX_LAYERS)
+ svc_ctx->spatial_layers = VPX_SS_MAX_LAYERS;
+
+ if (svc_ctx->temporal_layers < 1)
+ svc_ctx->temporal_layers = 1;
+ if (svc_ctx->temporal_layers > VPX_TS_MAX_LAYERS)
+ svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS;
+
si->layers = svc_ctx->spatial_layers;
// Assign target bitrate for each layer. We calculate the ratio
@@ -515,9 +537,18 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
enc_cfg->ss_enable_auto_alt_ref[i] = si->enable_auto_alt_ref[i];
#endif
+ if (svc_ctx->temporal_layers > 1) {
+ int i;
+ for (i = 0; i < svc_ctx->temporal_layers; ++i) {
+ enc_cfg->ts_target_bitrate[i] = enc_cfg->rc_target_bitrate /
+ svc_ctx->temporal_layers;
+ enc_cfg->ts_rate_decimator[i] = 1 << (svc_ctx->temporal_layers - 1 - i);
+ }
+ }
+
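
A worked example of the assignment above:

    /* rc_target_bitrate = 900, temporal_layers = 3 gives
     *   ts_target_bitrate = {300, 300, 300}
     *   ts_rate_decimator = {4, 2, 1}   (base layer at 1/4 frame rate) */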
// modify encoder configuration
enc_cfg->ss_number_layers = si->layers;
- enc_cfg->ts_number_layers = 1; // Temporal layers not used in this encoder.
+ enc_cfg->ts_number_layers = svc_ctx->temporal_layers;
// TODO(ivanmaltz): determine if these values need to be set explicitly for
// svc, or if the normal default/override mechanism can be used
@@ -534,7 +565,8 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
enc_cfg->rc_buf_initial_sz = 500;
enc_cfg->rc_buf_optimal_sz = 600;
enc_cfg->rc_buf_sz = 1000;
- enc_cfg->g_error_resilient = 1;
+ if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0)
+ enc_cfg->g_error_resilient = 1;
// Initialize codec
res = vpx_codec_enc_init(codec_ctx, iface, enc_cfg, VPX_CODEC_USE_PSNR);
diff --git a/source/libvpx/vpx/src/vpx_codec.c b/source/libvpx/vpx/src/vpx_codec.c
index d175eae..5a495ce 100644
--- a/source/libvpx/vpx/src/vpx_codec.c
+++ b/source/libvpx/vpx/src/vpx_codec.c
@@ -88,8 +88,7 @@ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) {
else if (!ctx->iface || !ctx->priv)
res = VPX_CODEC_ERROR;
else {
- if (ctx->priv->alg_priv)
- ctx->iface->destroy(ctx->priv->alg_priv);
+ ctx->iface->destroy((vpx_codec_alg_priv_t *)ctx->priv);
ctx->iface = NULL;
ctx->name = NULL;
@@ -125,7 +124,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx,
va_list ap;
va_start(ap, ctrl_id);
- res = entry->fn(ctx->priv->alg_priv, ap);
+ res = entry->fn((vpx_codec_alg_priv_t *)ctx->priv, ap);
va_end(ap);
break;
}
diff --git a/source/libvpx/vpx/src/vpx_decoder.c b/source/libvpx/vpx/src/vpx_decoder.c
index b19c440..802d8ed 100644
--- a/source/libvpx/vpx/src/vpx_decoder.c
+++ b/source/libvpx/vpx/src/vpx_decoder.c
@@ -18,9 +18,13 @@
#define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var)
+static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
+ return (vpx_codec_alg_priv_t *)ctx->priv;
+}
+
vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx,
vpx_codec_iface_t *iface,
- vpx_codec_dec_cfg_t *cfg,
+ const vpx_codec_dec_cfg_t *cfg,
vpx_codec_flags_t flags,
int ver) {
vpx_codec_err_t res;
@@ -94,7 +98,7 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx,
si->w = 0;
si->h = 0;
- res = ctx->iface->dec.get_si(ctx->priv->alg_priv, si);
+ res = ctx->iface->dec.get_si(get_alg_priv(ctx), si);
}
return SAVE_STATUS(ctx, res);
@@ -115,8 +119,8 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx,
else if (!ctx->iface || !ctx->priv)
res = VPX_CODEC_ERROR;
else {
- res = ctx->iface->dec.decode(ctx->priv->alg_priv, data, data_sz,
- user_priv, deadline);
+ res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv,
+ deadline);
}
return SAVE_STATUS(ctx, res);
@@ -129,7 +133,7 @@ vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx,
if (!ctx || !iter || !ctx->iface || !ctx->priv)
img = NULL;
else
- img = ctx->iface->dec.get_frame(ctx->priv->alg_priv, iter);
+ img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);
return img;
}
@@ -185,7 +189,7 @@ vpx_codec_err_t vpx_codec_set_frame_buffer_functions(
!(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
res = VPX_CODEC_ERROR;
} else {
- res = ctx->iface->dec.set_fb_fn(ctx->priv->alg_priv, cb_get, cb_release,
+ res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
cb_priv);
}
diff --git a/source/libvpx/vpx/src/vpx_encoder.c b/source/libvpx/vpx/src/vpx_encoder.c
index 5773455..1903b55 100644
--- a/source/libvpx/vpx/src/vpx_encoder.c
+++ b/source/libvpx/vpx/src/vpx_encoder.c
@@ -20,9 +20,13 @@
#define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var)
+static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
+ return (vpx_codec_alg_priv_t *)ctx->priv;
+}
+
vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx,
vpx_codec_iface_t *iface,
- vpx_codec_enc_cfg_t *cfg,
+ const vpx_codec_enc_cfg_t *cfg,
vpx_codec_flags_t flags,
int ver) {
vpx_codec_err_t res;
@@ -216,7 +220,7 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx,
FLOATING_POINT_INIT();
if (num_enc == 1)
- res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
+ res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts,
duration, flags, deadline);
else {
/* Multi-resolution encoding:
@@ -230,7 +234,7 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx,
if (img) img += num_enc - 1;
for (i = num_enc - 1; i >= 0; i--) {
- if ((res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
+ if ((res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts,
duration, flags, deadline)))
break;
@@ -259,7 +263,7 @@ const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx,
else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER))
ctx->err = VPX_CODEC_INCAPABLE;
else
- pkt = ctx->iface->enc.get_cx_data(ctx->priv->alg_priv, iter);
+ pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter);
}
if (pkt && pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
@@ -327,7 +331,7 @@ const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx) {
else if (!ctx->iface->enc.get_preview)
ctx->err = VPX_CODEC_INCAPABLE;
else
- img = ctx->iface->enc.get_preview(ctx->priv->alg_priv);
+ img = ctx->iface->enc.get_preview(get_alg_priv(ctx));
}
return img;
@@ -345,7 +349,7 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx) {
else if (!ctx->iface->enc.get_glob_hdrs)
ctx->err = VPX_CODEC_INCAPABLE;
else
- buf = ctx->iface->enc.get_glob_hdrs(ctx->priv->alg_priv);
+ buf = ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx));
}
return buf;
@@ -361,7 +365,7 @@ vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx,
else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER))
res = VPX_CODEC_INCAPABLE;
else
- res = ctx->iface->enc.cfg_set(ctx->priv->alg_priv, cfg);
+ res = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg);
return SAVE_STATUS(ctx, res);
}
diff --git a/source/libvpx/vpx/src/vpx_image.c b/source/libvpx/vpx/src/vpx_image.c
index e20703a..e58b61e 100644
--- a/source/libvpx/vpx/src/vpx_image.c
+++ b/source/libvpx/vpx/src/vpx_image.c
@@ -154,7 +154,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img,
goto fail;
img->fmt = fmt;
- img->bit_depth = (fmt & VPX_IMG_FMT_HIGH) ? 16 : 8;
+ img->bit_depth = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
img->w = w;
img->h = h;
img->x_chroma_shift = xcs;
diff --git a/source/libvpx/vpx/svc_context.h b/source/libvpx/vpx/svc_context.h
index e0de263..eea3b13 100644
--- a/source/libvpx/vpx/svc_context.h
+++ b/source/libvpx/vpx/svc_context.h
@@ -31,7 +31,8 @@ typedef enum SVC_LOG_LEVEL {
typedef struct {
// public interface to svc_command options
- int spatial_layers; // number of layers
+ int spatial_layers; // number of spatial layers
+ int temporal_layers; // number of temporal layers
SVC_LOG_LEVEL log_level; // amount of information to display
int log_print; // when set, printf log messages instead of returning the
// message with svc_get_message
diff --git a/source/libvpx/vpx/vp8dx.h b/source/libvpx/vpx/vp8dx.h
index bd7f19c..379b306 100644
--- a/source/libvpx/vpx/vp8dx.h
+++ b/source/libvpx/vpx/vp8dx.h
@@ -75,6 +75,9 @@ enum vp8_dec_control_id {
/** control function to get the display dimensions for the current frame. */
VP9D_GET_DISPLAY_SIZE,
+ /** control function to get the bit depth of the stream. */
+ VP9D_GET_BIT_DEPTH,
+
/** For testing. */
VP9_INVERT_TILE_DECODE_ORDER,
@@ -118,6 +121,7 @@ VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *)
VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *)
VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *)
VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *)
+VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *)
VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
/*! @} - end defgroup vp8_decoder */
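
Note: VP9D_GET_BIT_DEPTH is registered via VPX_CTRL_USE_TYPE as taking an unsigned int *, so a caller queries it like this (sketch; assumes an initialized decoder):

    unsigned int bit_depth = 0;
    if (vpx_codec_control(&decoder, VP9D_GET_BIT_DEPTH, &bit_depth) == VPX_CODEC_OK)
      fprintf(stderr, "stream bit depth: %u\n", bit_depth);
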
diff --git a/source/libvpx/vpx/vpx_codec.h b/source/libvpx/vpx/vpx_codec.h
index 07df72a..b25308e 100644
--- a/source/libvpx/vpx/vpx_codec.h
+++ b/source/libvpx/vpx/vpx_codec.h
@@ -203,9 +203,11 @@ extern "C" {
const char *err_detail; /**< Detailed info, if available */
vpx_codec_flags_t init_flags; /**< Flags passed at init time */
union {
- struct vpx_codec_dec_cfg *dec; /**< Decoder Configuration Pointer */
- struct vpx_codec_enc_cfg *enc; /**< Encoder Configuration Pointer */
- void *raw;
+ /**< Decoder Configuration Pointer */
+ const struct vpx_codec_dec_cfg *dec;
+ /**< Encoder Configuration Pointer */
+ const struct vpx_codec_enc_cfg *enc;
+ const void *raw;
} config; /**< Configuration pointer aliasing union */
vpx_codec_priv_t *priv; /**< Algorithm private storage */
} vpx_codec_ctx_t;
@@ -215,9 +217,9 @@ extern "C" {
* This enumeration determines the bit depth of the codec.
*/
typedef enum vpx_bit_depth {
- VPX_BITS_8, /**< 8 bits */
- VPX_BITS_10, /**< 10 bits */
- VPX_BITS_12 /**< 12 bits */
+ VPX_BITS_8 = 8, /**< 8 bits */
+ VPX_BITS_10 = 10, /**< 10 bits */
+ VPX_BITS_12 = 12, /**< 12 bits */
} vpx_bit_depth_t;
/*
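
Note: pinning vpx_bit_depth_t to the literal values 8/10/12 lets the enum take part in arithmetic instead of acting as an opaque tag; vpxenc relies on this below when it computes

    /* shift needed to widen input samples to the codec's internal depth */
    const int input_shift = (int)cfg.g_bit_depth - (int)cfg.g_input_bit_depth;
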
diff --git a/source/libvpx/vpx/vpx_decoder.h b/source/libvpx/vpx/vpx_decoder.h
index 10b89fa..62fd919 100644
--- a/source/libvpx/vpx/vpx_decoder.h
+++ b/source/libvpx/vpx/vpx_decoder.h
@@ -135,7 +135,7 @@ extern "C" {
*/
vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx,
vpx_codec_iface_t *iface,
- vpx_codec_dec_cfg_t *cfg,
+ const vpx_codec_dec_cfg_t *cfg,
vpx_codec_flags_t flags,
int ver);
diff --git a/source/libvpx/vpx/vpx_encoder.h b/source/libvpx/vpx/vpx_encoder.h
index 7dbbf2f..fdabed1 100644
--- a/source/libvpx/vpx/vpx_encoder.h
+++ b/source/libvpx/vpx/vpx_encoder.h
@@ -80,6 +80,9 @@ extern "C" {
*/
#define VPX_CODEC_CAP_OUTPUT_PARTITION 0x20000
+/*! Can support input images with bit depth greater than 8.
+ */
+#define VPX_CODEC_CAP_HIGHBITDEPTH 0x40000
/*! \brief Initialization-time Feature Enabling
*
@@ -91,6 +94,7 @@ extern "C" {
#define VPX_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */
#define VPX_CODEC_USE_OUTPUT_PARTITION 0x20000 /**< Make the encoder output one
partition at a time. */
+#define VPX_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */
/*!\brief Generic fixed size buffer structure
@@ -188,14 +192,14 @@ extern "C" {
has id 0.*/
} frame; /**< data for compressed frame packet */
- struct vpx_fixed_buf twopass_stats; /**< data for two-pass packet */
- struct vpx_fixed_buf firstpass_mb_stats; /**< first pass mb packet */
+ vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */
+ vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */
struct vpx_psnr_pkt {
unsigned int samples[4]; /**< Number of samples, total/y/u/v */
uint64_t sse[4]; /**< sum squared error, total/y/u/v */
double psnr[4]; /**< PSNR, total/y/u/v */
} psnr; /**< data for PSNR packet */
- struct vpx_fixed_buf raw; /**< data for arbitrary packets */
+ vpx_fixed_buf_t raw; /**< data for arbitrary packets */
#if CONFIG_SPATIAL_SVC
size_t layer_sizes[VPX_SS_MAX_LAYERS];
#endif
@@ -324,6 +328,21 @@ extern "C" {
*/
unsigned int g_h;
+ /*!\brief Bit-depth of the codec
+ *
+ * This value identifies the bit depth of the codec.
+ * Only certain bit-depths are supported as identified in the
+ * vpx_bit_depth_t enum.
+ */
+ vpx_bit_depth_t g_bit_depth;
+
+ /*!\brief Bit-depth of the input frames
+ *
+ * This value identifies the bit depth of the input frames.
+ * Note that the frames passed as input to the encoder must have
+ * this bit-depth.
+ */
+ unsigned int g_input_bit_depth;
/*!\brief Stream timebase units
*
@@ -452,14 +471,14 @@ extern "C" {
* A buffer containing all of the stats packets produced in the first
* pass, concatenated.
*/
- struct vpx_fixed_buf rc_twopass_stats_in;
+ vpx_fixed_buf_t rc_twopass_stats_in;
/*!\brief first pass mb stats buffer.
*
* A buffer containing all of the first pass mb stats packets produced
* in the first pass, concatenated.
*/
- struct vpx_fixed_buf rc_firstpass_mb_stats_in;
+ vpx_fixed_buf_t rc_firstpass_mb_stats_in;
/*!\brief Target data rate
*
@@ -715,7 +734,7 @@ extern "C" {
*/
vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx,
vpx_codec_iface_t *iface,
- vpx_codec_enc_cfg_t *cfg,
+ const vpx_codec_enc_cfg_t *cfg,
vpx_codec_flags_t flags,
int ver);
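
Note: g_bit_depth, g_input_bit_depth and the VPX_CODEC_USE_HIGHBITDEPTH init-time flag together make a high-bit-depth encode expressible through the public API. A minimal sketch, assuming a build configured with VP9 high-bit-depth support (error checks elided):

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    vpx_codec_ctx_t ctx;
    vpx_codec_enc_cfg_t cfg;
    vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0);
    cfg.g_profile = 2;              /* profile 2 carries 10/12-bit streams */
    cfg.g_bit_depth = VPX_BITS_10;  /* internal codec depth */
    cfg.g_input_bit_depth = 10;     /* depth of the frames handed in */
    vpx_codec_enc_init(&ctx, vpx_codec_vp9_cx(), &cfg,
                       VPX_CODEC_USE_HIGHBITDEPTH);
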
diff --git a/source/libvpx/vpx/vpx_image.h b/source/libvpx/vpx/vpx_image.h
index 7b04b70..0b7bb90 100644
--- a/source/libvpx/vpx/vpx_image.h
+++ b/source/libvpx/vpx/vpx_image.h
@@ -31,10 +31,10 @@ extern "C" {
#define VPX_IMAGE_ABI_VERSION (2) /**<\hideinitializer*/
-#define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format */
-#define VPX_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U plane in memory */
-#define VPX_IMG_FMT_HAS_ALPHA 0x400 /**< Image has an alpha channel component */
-#define VPX_IMG_FMT_HIGH 0x800 /**< Image uses 16bit framebuffer */
+#define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */
+#define VPX_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */
+#define VPX_IMG_FMT_HAS_ALPHA 0x400 /**< Image has an alpha channel. */
+#define VPX_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */
/*!\brief List of supported image formats */
typedef enum vpx_img_fmt {
@@ -59,9 +59,9 @@ extern "C" {
VPX_IMG_FMT_I422 = VPX_IMG_FMT_PLANAR | 5,
VPX_IMG_FMT_I444 = VPX_IMG_FMT_PLANAR | 6,
VPX_IMG_FMT_444A = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 7,
- VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGH,
- VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGH,
- VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGH
+ VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH,
+ VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH,
+ VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH
} vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
#if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
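
Note: since the 16-bit formats are the 8-bit enum values with VPX_IMG_FMT_HIGHBITDEPTH or'ed in, the flag can be tested and stripped arithmetically, which is exactly how vpxdec and vpxenc convert between variants below:

    const int is_16bit = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) != 0;
    const vpx_img_fmt_t base_fmt =
        (vpx_img_fmt_t)(img->fmt & ~VPX_IMG_FMT_HIGHBITDEPTH);  /* I42016 -> I420 */
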
diff --git a/source/libvpx/vpx_mem/vpx_mem.c b/source/libvpx/vpx_mem/vpx_mem.c
index 059248b..da61642 100644
--- a/source/libvpx/vpx_mem/vpx_mem.c
+++ b/source/libvpx/vpx_mem/vpx_mem.c
@@ -16,6 +16,7 @@
#include <stdlib.h>
#include <string.h>
#include "include/vpx_mem_intrnl.h"
+#include "vpx/vpx_integer.h"
#if CONFIG_MEM_TRACKER
#ifndef VPX_NO_GLOBALS
@@ -452,6 +453,29 @@ void *vpx_memset(void *dest, int val, size_t length) {
return VPX_MEMSET_L(dest, val, length);
}
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+void *vpx_memset16(void *dest, int val, size_t length) {
+#if CONFIG_MEM_CHECKS
+ if ((int)dest < 0x4000) {
+ _P(printf("WARNING: vpx_memset16 dest:0x%x val:%d len:%d\n",
+ (int)dest, val, length);)
+
+#if defined(VXWORKS)
+ sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
+
+ vx_sleep(10000);
+#endif
+ }
+#endif
+ int i;
+ void *orig = dest;
+ uint16_t *dest16 = dest;
+ for (i = 0; i < length; i++)
+ *dest16++ = val;
+ return orig;
+}
+#endif // CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+
void *vpx_memmove(void *dest, const void *src, size_t count) {
#if CONFIG_MEM_CHECKS
diff --git a/source/libvpx/vpx_mem/vpx_mem.h b/source/libvpx/vpx_mem/vpx_mem.h
index 33686b2..e2391f4 100644
--- a/source/libvpx/vpx_mem/vpx_mem.h
+++ b/source/libvpx/vpx_mem/vpx_mem.h
@@ -73,6 +73,9 @@ extern "C" {
void *vpx_memcpy(void *dest, const void *src, size_t length);
void *vpx_memset(void *dest, int val, size_t length);
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ void *vpx_memset16(void *dest, int val, size_t length);
+#endif
void *vpx_memmove(void *dest, const void *src, size_t count);
/* special memory functions */
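
Note: vpx_memset16() counts in 16-bit elements, not bytes, mirroring vpx_memset() for wide-sample buffers. Usage sketch:

    uint16_t row[64];
    vpx_memset16(row, 512, 64);  /* 64 samples (128 bytes); 512 is mid-gray at 10 bits */
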
diff --git a/source/libvpx/vpx_scale/generic/yv12config.c b/source/libvpx/vpx_scale/generic/yv12config.c
index 827bce7..70d7ac0 100644
--- a/source/libvpx/vpx_scale/generic/yv12config.c
+++ b/source/libvpx/vpx_scale/generic/yv12config.c
@@ -13,6 +13,9 @@
#include "./vpx_config.h"
#include "vpx_scale/yv12config.h"
#include "vpx_mem/vpx_mem.h"
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
/****************************************************************************
* Exports
@@ -136,7 +139,11 @@ int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int width, int height,
- int ss_x, int ss_y, int border,
+ int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ int border,
vpx_codec_frame_buffer_t *fb,
vpx_get_frame_buffer_cb_fn_t cb,
void *cb_priv) {
@@ -161,11 +168,21 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
const int alpha_border_h = border;
const uint64_t alpha_plane_size = (alpha_height + 2 * alpha_border_h) *
(uint64_t)alpha_stride;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint64_t frame_size = (1 + use_highbitdepth) *
+ (yplane_size + 2 * uvplane_size + alpha_plane_size);
+#else
const uint64_t frame_size = yplane_size + 2 * uvplane_size +
alpha_plane_size;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#else
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint64_t frame_size =
+ (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
#else
const uint64_t frame_size = yplane_size + 2 * uvplane_size;
-#endif
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // CONFIG_ALPHA
if (cb != NULL) {
const int align_addr_extra_size = 31;
const uint64_t external_frame_size = frame_size + align_addr_extra_size;
@@ -231,11 +248,31 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
ybf->border = border;
ybf->frame_size = (int)frame_size;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (use_highbitdepth) {
+ // Store uint16 addresses when using 16bit framebuffers
+ uint8_t *p = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
+ ybf->y_buffer = p + (border * y_stride) + border;
+ ybf->u_buffer = p + yplane_size +
+ (uv_border_h * uv_stride) + uv_border_w;
+ ybf->v_buffer = p + yplane_size + uvplane_size +
+ (uv_border_h * uv_stride) + uv_border_w;
+ ybf->flags = YV12_FLAG_HIGHBITDEPTH;
+ } else {
+ ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
+ ybf->u_buffer = ybf->buffer_alloc + yplane_size +
+ (uv_border_h * uv_stride) + uv_border_w;
+ ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size +
+ (uv_border_h * uv_stride) + uv_border_w;
+ ybf->flags = 0;
+ }
+#else
ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
ybf->u_buffer = ybf->buffer_alloc + yplane_size +
(uv_border_h * uv_stride) + uv_border_w;
ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size +
(uv_border_h * uv_stride) + uv_border_w;
+#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_ALPHA
ybf->alpha_width = alpha_width;
@@ -252,11 +289,18 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int width, int height,
- int ss_x, int ss_y, int border) {
+ int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ int border) {
if (ybf) {
vp9_free_frame_buffer(ybf);
- return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, border,
- NULL, NULL, NULL);
+ return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ border, NULL, NULL, NULL);
}
return -2;
}
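
Note: the CONVERT_TO_BYTEPTR()/CONVERT_TO_SHORTPTR() pair used above comes from vp9/common/vp9_common.h; in this revision they are roughly

    #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
    #define CONVERT_TO_BYTEPTR(x)  ((uint8_t *)(((uintptr_t)(x)) >> 1))

so a 16-bit buffer can travel through the existing uint8_t * fields; a converted pointer must only be dereferenced after converting back, which is why YV12_FLAG_HIGHBITDEPTH tags every buffer stored this way.
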
diff --git a/source/libvpx/vpx_scale/generic/yv12extend.c b/source/libvpx/vpx_scale/generic/yv12extend.c
index 036a505..0485452 100644
--- a/source/libvpx/vpx_scale/generic/yv12extend.c
+++ b/source/libvpx/vpx_scale/generic/yv12extend.c
@@ -13,6 +13,9 @@
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/yv12config.h"
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
static void extend_plane(uint8_t *const src, int src_stride,
int width, int height,
@@ -55,6 +58,50 @@ static void extend_plane(uint8_t *const src, int src_stride,
}
}
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static void extend_plane_high(uint8_t *const src8, int src_stride,
+ int width, int height,
+ int extend_top, int extend_left,
+ int extend_bottom, int extend_right) {
+ int i;
+ const int linesize = extend_left + extend_right + width;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+
+ /* copy the left and right most columns out */
+ uint16_t *src_ptr1 = src;
+ uint16_t *src_ptr2 = src + width - 1;
+ uint16_t *dst_ptr1 = src - extend_left;
+ uint16_t *dst_ptr2 = src + width;
+
+ for (i = 0; i < height; ++i) {
+ vpx_memset16(dst_ptr1, src_ptr1[0], extend_left);
+ vpx_memset16(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_stride;
+ src_ptr2 += src_stride;
+ dst_ptr1 += src_stride;
+ dst_ptr2 += src_stride;
+ }
+
+ /* Now copy the top and bottom lines into each line of the respective
+ * borders
+ */
+ src_ptr1 = src - extend_left;
+ src_ptr2 = src + src_stride * (height - 1) - extend_left;
+ dst_ptr1 = src + src_stride * -extend_top - extend_left;
+ dst_ptr2 = src + src_stride * height - extend_left;
+
+ for (i = 0; i < extend_top; ++i) {
+ vpx_memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
+ dst_ptr1 += src_stride;
+ }
+
+ for (i = 0; i < extend_bottom; ++i) {
+ vpx_memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
+ dst_ptr2 += src_stride;
+ }
+}
+#endif
+
void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
const int uv_border = ybf->border / 2;
@@ -64,6 +111,31 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
assert(ybf->y_height - ybf->y_crop_height >= 0);
assert(ybf->y_width - ybf->y_crop_width >= 0);
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ extend_plane_high(
+ ybf->y_buffer, ybf->y_stride,
+ ybf->y_crop_width, ybf->y_crop_height,
+ ybf->border, ybf->border,
+ ybf->border + ybf->y_height - ybf->y_crop_height,
+ ybf->border + ybf->y_width - ybf->y_crop_width);
+
+ extend_plane_high(
+ ybf->u_buffer, ybf->uv_stride,
+ (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,
+ ybf->border / 2, ybf->border / 2,
+ (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,
+ (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
+
+ extend_plane_high(
+ ybf->v_buffer, ybf->uv_stride,
+ (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,
+ ybf->border / 2, ybf->border / 2,
+ (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,
+ (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
+ return;
+ }
+#endif
extend_plane(ybf->y_buffer, ybf->y_stride,
ybf->y_crop_width, ybf->y_crop_height,
ybf->border, ybf->border,
@@ -99,6 +171,20 @@ static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
assert(ybf->y_height - ybf->y_crop_height >= 0);
assert(ybf->y_width - ybf->y_crop_width >= 0);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ extend_plane_high(ybf->y_buffer, ybf->y_stride,
+ ybf->y_crop_width, ybf->y_crop_height,
+ ext_size, ext_size,
+ ext_size + ybf->y_height - ybf->y_crop_height,
+ ext_size + ybf->y_width - ybf->y_crop_width);
+ extend_plane_high(ybf->u_buffer, ybf->uv_stride,
+ c_w, c_h, c_et, c_el, c_eb, c_er);
+ extend_plane_high(ybf->v_buffer, ybf->uv_stride,
+ c_w, c_h, c_et, c_el, c_eb, c_er);
+ return;
+ }
+#endif
extend_plane(ybf->y_buffer, ybf->y_stride,
ybf->y_crop_width, ybf->y_crop_height,
ext_size, ext_size,
@@ -121,6 +207,14 @@ void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) {
VP9INNERBORDERINPIXELS : ybf->border;
extend_frame(ybf, inner_bw);
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ vpx_memcpy(dst, src, num * sizeof(uint16_t));
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_VP9
// Copies the source image into the destination image and updates the
@@ -140,6 +234,40 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
assert(src_ybc->y_height == dst_ybc->y_height);
#endif
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+ assert(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH);
+ for (row = 0; row < src_ybc->y_height; ++row) {
+ memcpy_short_addr(dst, src, src_ybc->y_width);
+ src += src_ybc->y_stride;
+ dst += dst_ybc->y_stride;
+ }
+
+ src = src_ybc->u_buffer;
+ dst = dst_ybc->u_buffer;
+
+ for (row = 0; row < src_ybc->uv_height; ++row) {
+ memcpy_short_addr(dst, src, src_ybc->uv_width);
+ src += src_ybc->uv_stride;
+ dst += dst_ybc->uv_stride;
+ }
+
+ src = src_ybc->v_buffer;
+ dst = dst_ybc->v_buffer;
+
+ for (row = 0; row < src_ybc->uv_height; ++row) {
+ memcpy_short_addr(dst, src, src_ybc->uv_width);
+ src += src_ybc->uv_stride;
+ dst += dst_ybc->uv_stride;
+ }
+
+ vp8_yv12_extend_frame_borders_c(dst_ybc);
+ return;
+ } else {
+ assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH));
+ }
+#endif
+
for (row = 0; row < src_ybc->y_height; ++row) {
vpx_memcpy(dst, src, src_ybc->y_width);
src += src_ybc->y_stride;
@@ -173,6 +301,19 @@ void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
const uint8_t *src = src_ybc->y_buffer;
uint8_t *dst = dst_ybc->y_buffer;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+ for (row = 0; row < src_ybc->y_height; ++row) {
+ vpx_memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t));
+ src16 += src_ybc->y_stride;
+ dst16 += dst_ybc->y_stride;
+ }
+ return;
+ }
+#endif
+
for (row = 0; row < src_ybc->y_height; ++row) {
vpx_memcpy(dst, src, src_ybc->y_width);
src += src_ybc->y_stride;
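
Note: every 16-bit path in this file keeps widths and strides in samples, converting to bytes only at the copy boundary; memcpy_short_addr() packages that convention:

    /* copies y_width uint16_t samples, i.e. y_width * 2 bytes */
    memcpy_short_addr(dst, src, src_ybc->y_width);
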
diff --git a/source/libvpx/vpx_scale/yv12config.h b/source/libvpx/vpx_scale/yv12config.h
index cdde75c..eb0a8d6 100644
--- a/source/libvpx/vpx_scale/yv12config.h
+++ b/source/libvpx/vpx_scale/yv12config.h
@@ -55,6 +55,8 @@ typedef struct yv12_buffer_config {
int flags;
} YV12_BUFFER_CONFIG;
+#define YV12_FLAG_HIGHBITDEPTH 1
+
int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int width, int height, int border);
int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
@@ -63,6 +65,9 @@ int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int width, int height, int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
int border);
// Updates the yv12 buffer config with the frame buffer. If cb is not
@@ -73,6 +78,9 @@ int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
// on failure.
int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int width, int height, int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
int border,
vpx_codec_frame_buffer_t *fb,
vpx_get_frame_buffer_cb_fn_t cb,
diff --git a/source/libvpx/vpxdec.c b/source/libvpx/vpxdec.c
index 6c822ab..6470081 100644
--- a/source/libvpx/vpxdec.c
+++ b/source/libvpx/vpxdec.c
@@ -90,12 +90,20 @@ static const arg_def_t fb_arg =
static const arg_def_t md5arg = ARG_DEF(NULL, "md5", 0,
"Compute the MD5 sum of the decoded frame");
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static const arg_def_t outbitdeptharg = ARG_DEF(
+ NULL, "output-bit-depth", 1,
+ "Output bit-depth for decoded frames");
+#endif
static const arg_def_t *all_args[] = {
&codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, &noblitarg,
&progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile,
&threadsarg, &verbosearg, &scalearg, &fb_arg,
&md5arg, &error_concealment, &continuearg,
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ &outbitdeptharg,
+#endif
NULL
};
@@ -129,6 +137,26 @@ static const arg_def_t *vp8_pp_args[] = {
#if CONFIG_LIBYUV
static INLINE int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst,
FilterModeEnum mode) {
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (src->fmt == VPX_IMG_FMT_I42016) {
+ assert(dst->fmt == VPX_IMG_FMT_I42016);
+ return I420Scale_16((uint16_t*)src->planes[VPX_PLANE_Y],
+ src->stride[VPX_PLANE_Y]/2,
+ (uint16_t*)src->planes[VPX_PLANE_U],
+ src->stride[VPX_PLANE_U]/2,
+ (uint16_t*)src->planes[VPX_PLANE_V],
+ src->stride[VPX_PLANE_V]/2,
+ src->d_w, src->d_h,
+ (uint16_t*)dst->planes[VPX_PLANE_Y],
+ dst->stride[VPX_PLANE_Y]/2,
+ (uint16_t*)dst->planes[VPX_PLANE_U],
+ dst->stride[VPX_PLANE_U]/2,
+ (uint16_t*)dst->planes[VPX_PLANE_V],
+ dst->stride[VPX_PLANE_V]/2,
+ dst->d_w, dst->d_h,
+ mode);
+ }
+#endif
assert(src->fmt == VPX_IMG_FMT_I420);
assert(dst->fmt == VPX_IMG_FMT_I420);
return I420Scale(src->planes[VPX_PLANE_Y], src->stride[VPX_PLANE_Y],
@@ -265,6 +293,11 @@ static void update_image_md5(const vpx_image_t *img, const int planes[3],
static void write_image_file(const vpx_image_t *img, const int planes[3],
FILE *file) {
int i, y;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ const int bytes_per_sample = ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+#else
+ const int bytes_per_sample = 1;
+#endif
for (i = 0; i < 3; ++i) {
const int plane = planes[i];
@@ -274,7 +307,7 @@ static void write_image_file(const vpx_image_t *img, const int planes[3],
const int h = vpx_img_plane_height(img, plane);
for (y = 0; y < h; ++y) {
- fwrite(buf, 1, w, file);
+ fwrite(buf, bytes_per_sample, w, file);
buf += stride;
}
}
@@ -494,6 +527,178 @@ static FILE *open_outfile(const char *name) {
}
}
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static void high_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+ int input_shift) {
+ const int offset = input_shift > 0 ? (1 << (input_shift - 1)) : 0;
+ int plane;
+ if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift ||
+ dst->fmt != src->fmt || input_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (src->fmt) {
+ case VPX_IMG_FMT_I42016:
+ case VPX_IMG_FMT_I42216:
+ case VPX_IMG_FMT_I44416:
+ break;
+ default:
+ fatal("Unsupported image conversion");
+ break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w >>= src->x_chroma_shift;
+ h >>= src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ uint16_t *p_src = (uint16_t *)(src->planes[plane] +
+ y * src->stride[plane]);
+ uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
+ y * dst->stride[plane]);
+ for (x = 0; x < w; x++)
+ *p_dst++ = (*p_src++ << input_shift) + offset;
+ }
+ }
+}
+
+static void low_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+ int input_shift) {
+ const int offset = input_shift > 0 ? (1 << (input_shift - 1)) : 0;
+ int plane;
+ if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift ||
+ dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
+ input_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (src->fmt) {
+ case VPX_IMG_FMT_I420:
+ case VPX_IMG_FMT_I422:
+ case VPX_IMG_FMT_I444:
+ break;
+ default:
+ fatal("Unsupported image conversion");
+ break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w >>= src->x_chroma_shift;
+ h >>= src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
+ uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
+ y * dst->stride[plane]);
+ for (x = 0; x < w; x++) {
+ *p_dst++ = (*p_src++ << input_shift) + offset;
+ }
+ }
+ }
+}
+
+static void img_upshift(vpx_image_t *dst, vpx_image_t *src,
+ int input_shift) {
+ if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+ high_img_upshift(dst, src, input_shift);
+ } else {
+ low_img_upshift(dst, src, input_shift);
+ }
+}
+
+static void high_img_downshift(vpx_image_t *dst, vpx_image_t *src,
+ int down_shift) {
+ int plane;
+ if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift ||
+ dst->fmt != src->fmt || down_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (src->fmt) {
+ case VPX_IMG_FMT_I42016:
+ case VPX_IMG_FMT_I42216:
+ case VPX_IMG_FMT_I44416:
+ break;
+ default:
+ fatal("Unsupported image conversion");
+ break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w >>= src->x_chroma_shift;
+ h >>= src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ uint16_t *p_src = (uint16_t *)(src->planes[plane] +
+ y * src->stride[plane]);
+ uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
+ y * dst->stride[plane]);
+ for (x = 0; x < w; x++)
+ *p_dst++ = *p_src++ >> down_shift;
+ }
+ }
+}
+
+static void low_img_downshift(vpx_image_t *dst, vpx_image_t *src,
+ int down_shift) {
+ int plane;
+ if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift ||
+ src->fmt != dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
+ down_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (dst->fmt) {
+ case VPX_IMG_FMT_I420:
+ case VPX_IMG_FMT_I422:
+ case VPX_IMG_FMT_I444:
+ break;
+ default:
+ fatal("Unsupported image conversion");
+ break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w >>= src->x_chroma_shift;
+ h >>= src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ uint16_t *p_src = (uint16_t *)(src->planes[plane] +
+ y * src->stride[plane]);
+ uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
+ for (x = 0; x < w; x++) {
+ *p_dst++ = *p_src++ >> down_shift;
+ }
+ }
+ }
+}
+
+static void img_downshift(vpx_image_t *dst, vpx_image_t *src,
+ int down_shift) {
+ if (dst->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+ high_img_downshift(dst, src, down_shift);
+ } else {
+ low_img_downshift(dst, src, down_shift);
+ }
+}
+#endif
+
int main_loop(int argc, const char **argv_) {
vpx_codec_ctx_t decoder;
char *fn = NULL;
@@ -518,6 +723,9 @@ int main_loop(int argc, const char **argv_) {
int opt_yv12 = 0;
int opt_i420 = 0;
vpx_codec_dec_cfg_t cfg = {0, 0, 0};
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ int output_bit_depth = 0;
+#endif
#if CONFIG_VP8_DECODER
vp8_postproc_cfg_t vp8_pp_cfg = {0};
int vp8_dbg_color_ref_frame = 0;
@@ -529,6 +737,9 @@ int main_loop(int argc, const char **argv_) {
int dec_flags = 0;
int do_scale = 0;
vpx_image_t *scaled_img = NULL;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ vpx_image_t *img_shifted = NULL;
+#endif
int frame_avail, got_data;
int num_external_frame_buffers = 0;
struct ExternalFrameBufferList ext_fb_list = {0, NULL};
@@ -569,6 +780,9 @@ int main_loop(int argc, const char **argv_) {
use_y4m = 0;
flipuv = 1;
opt_yv12 = 1;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ output_bit_depth = 8; // For yv12, 8-bit output is assumed
+#endif
} else if (arg_match(&arg, &use_i420, argi)) {
use_y4m = 0;
flipuv = 0;
@@ -599,7 +813,13 @@ int main_loop(int argc, const char **argv_) {
do_scale = 1;
else if (arg_match(&arg, &fb_arg, argi))
num_external_frame_buffers = arg_parse_uint(&arg);
-
+ else if (arg_match(&arg, &continuearg, argi))
+ keep_going = 1;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ else if (arg_match(&arg, &outbitdeptharg, argi)) {
+ output_bit_depth = arg_parse_uint(&arg);
+ }
+#endif
#if CONFIG_VP8_DECODER
else if (arg_match(&arg, &addnoise_level, argi)) {
postproc = 1;
@@ -649,11 +869,8 @@ int main_loop(int argc, const char **argv_) {
}
} else if (arg_match(&arg, &error_concealment, argi)) {
ec_enabled = 1;
- } else if (arg_match(&arg, &continuearg, argi)) {
- keep_going = 1;
}
-
-#endif
+#endif // CONFIG_VP8_DECODER
else
argj++;
}
@@ -889,7 +1106,7 @@ int main_loop(int argc, const char **argv_) {
display_height = display_size[1];
}
}
- scaled_img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, display_width,
+ scaled_img = vpx_img_alloc(NULL, img->fmt, display_width,
display_height, 16);
scaled_img->bit_depth = img->bit_depth;
}
@@ -907,6 +1124,33 @@ int main_loop(int argc, const char **argv_) {
#endif
}
}
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ // Default to codec bit depth if output bit depth not set
+ if (!output_bit_depth) {
+ output_bit_depth = img->bit_depth;
+ }
+ // Shift up or down if necessary
+ if (output_bit_depth != img->bit_depth) {
+ if (!img_shifted) {
+ if (output_bit_depth == 8) {
+ img_shifted = vpx_img_alloc(
+ NULL, img->fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+ img->d_w, img->d_h, 16);
+ } else {
+ img_shifted = vpx_img_alloc(
+ NULL, img->fmt | VPX_IMG_FMT_HIGHBITDEPTH,
+ img->d_w, img->d_h, 16);
+ }
+ img_shifted->bit_depth = output_bit_depth;
+ }
+ if (output_bit_depth > img->bit_depth) {
+ img_upshift(img_shifted, img, output_bit_depth - img->bit_depth);
+ } else {
+ img_downshift(img_shifted, img, img->bit_depth - output_bit_depth);
+ }
+ img = img_shifted;
+ }
+#endif
if (single_file) {
if (use_y4m) {
@@ -1013,6 +1257,9 @@ fail:
free(buf);
if (scaled_img) vpx_img_free(scaled_img);
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (img_shifted) vpx_img_free(img_shifted);
+#endif
for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) {
free(ext_fb_list.ext_fb[i].data);
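
Note: the upshift offset (1 << (input_shift - 1)) re-centers samples by half a step in the wider range. Widening 8-bit to 10-bit means input_shift = 2 and offset = 2, so

    /* 8-bit 128 -> (128 << 2) + 2 = 514 */
    *p_dst++ = (*p_src++ << input_shift) + offset;

and because offset < (1 << input_shift), a subsequent img_downshift() by the same amount truncates straight back to 128, making the round trip lossless.
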
diff --git a/source/libvpx/vpxenc.c b/source/libvpx/vpxenc.c
index b99e61a..5afca24 100644
--- a/source/libvpx/vpxenc.c
+++ b/source/libvpx/vpxenc.c
@@ -200,6 +200,10 @@ static const arg_def_t experimental_bitstream =
ARG_DEF(NULL, "experimental-bitstream", 0,
"Allow experimental bitstream features.");
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static const arg_def_t test16bitinternalarg = ARG_DEF(
+ NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer");
+#endif
static const arg_def_t *main_args[] = {
&debugmode,
@@ -248,6 +252,9 @@ static const arg_def_t *global_args[] = {
#endif
&timebase, &framerate,
&error_resilient,
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ &test16bitinternalarg,
+#endif
&lag_in_frames, NULL
};
@@ -321,7 +328,7 @@ static const arg_def_t *kf_args[] = {
static const arg_def_t noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1,
"Noise sensitivity (frames to blur)");
static const arg_def_t sharpness = ARG_DEF(NULL, "sharpness", 1,
- "Filter sharpness (0-7)");
+ "Loop filter sharpness (0..7)");
static const arg_def_t static_thresh = ARG_DEF(NULL, "static-thresh", 1,
"Motion detection threshold");
static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1,
@@ -329,11 +336,11 @@ static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1,
static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,
"Enable automatic alt reference frames");
static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1,
- "AltRef Max Frames");
+ "AltRef max frames (0..15)");
static const arg_def_t arnr_strength = ARG_DEF(NULL, "arnr-strength", 1,
- "AltRef Strength");
+ "AltRef filter strength (0..6)");
static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1,
- "AltRef Type");
+ "AltRef type");
static const struct arg_enum_list tuning_enum[] = {
{"psnr", VP8_TUNE_PSNR},
{"ssim", VP8_TUNE_SSIM},
@@ -378,9 +385,26 @@ static const arg_def_t aq_mode = ARG_DEF(
"Adaptive quantization mode (0: off (default), 1: variance 2: complexity, "
"3: cyclic refresh)");
static const arg_def_t frame_periodic_boost = ARG_DEF(
- NULL, "frame_boost", 1,
+ NULL, "frame-boost", 1,
"Enable frame periodic boost (0: off (default), 1: on)");
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static const struct arg_enum_list bitdepth_enum[] = {
+ {"8", VPX_BITS_8},
+ {"10", VPX_BITS_10},
+ {"12", VPX_BITS_12},
+ {NULL, 0}
+};
+
+static const arg_def_t bitdeptharg = ARG_DEF_ENUM("b", "bit-depth", 1,
+ "Bit depth for codec "
+ "(8 for version <=1, "
+ "10 or 12 for version 2)",
+ bitdepth_enum);
+static const arg_def_t inbitdeptharg = ARG_DEF(NULL, "input-bit-depth", 1,
+ "Bit depth of input");
+#endif
+
static const struct arg_enum_list tune_content_enum[] = {
{"default", VP9E_CONTENT_DEFAULT},
{"screen", VP9E_CONTENT_SCREEN},
@@ -395,6 +419,9 @@ static const arg_def_t *vp9_args[] = {
&tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
&tune_ssim, &cq_level, &max_intra_rate_pct, &lossless,
&frame_parallel_decoding, &aq_mode, &frame_periodic_boost, &tune_content,
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ &bitdeptharg, &inbitdeptharg,
+#endif
NULL
};
static const int vp9_arg_ctrl_map[] = {
@@ -450,6 +477,102 @@ void usage_exit() {
}
#define mmin(a, b) ((a) < (b) ? (a) : (b))
+
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static void find_mismatch_high(const vpx_image_t *const img1,
+ const vpx_image_t *const img2,
+ int yloc[4], int uloc[4], int vloc[4]) {
+ uint16_t *plane1, *plane2;
+ uint32_t stride1, stride2;
+ const uint32_t bsize = 64;
+ const uint32_t bsizey = bsize >> img1->y_chroma_shift;
+ const uint32_t bsizex = bsize >> img1->x_chroma_shift;
+ const uint32_t c_w =
+ (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+ const uint32_t c_h =
+ (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+ int match = 1;
+ uint32_t i, j;
+ yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
+ plane1 = (uint16_t*)img1->planes[VPX_PLANE_Y];
+ plane2 = (uint16_t*)img2->planes[VPX_PLANE_Y];
+ stride1 = img1->stride[VPX_PLANE_Y]/2;
+ stride2 = img2->stride[VPX_PLANE_Y]/2;
+ for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
+ for (j = 0; match && j < img1->d_w; j += bsize) {
+ int k, l;
+ const int si = mmin(i + bsize, img1->d_h) - i;
+ const int sj = mmin(j + bsize, img1->d_w) - j;
+ for (k = 0; match && k < si; ++k) {
+ for (l = 0; match && l < sj; ++l) {
+ if (*(plane1 + (i + k) * stride1 + j + l) !=
+ *(plane2 + (i + k) * stride2 + j + l)) {
+ yloc[0] = i + k;
+ yloc[1] = j + l;
+ yloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+ yloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+ match = 0;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
+ plane1 = (uint16_t*)img1->planes[VPX_PLANE_U];
+ plane2 = (uint16_t*)img2->planes[VPX_PLANE_U];
+ stride1 = img1->stride[VPX_PLANE_U]/2;
+ stride2 = img2->stride[VPX_PLANE_U]/2;
+ for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+ for (j = 0; match && j < c_w; j += bsizex) {
+ int k, l;
+ const int si = mmin(i + bsizey, c_h) - i;
+ const int sj = mmin(j + bsizex, c_w) - j;
+ for (k = 0; match && k < si; ++k) {
+ for (l = 0; match && l < sj; ++l) {
+ if (*(plane1 + (i + k) * stride1 + j + l) !=
+ *(plane2 + (i + k) * stride2 + j + l)) {
+ uloc[0] = i + k;
+ uloc[1] = j + l;
+ uloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+ uloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+ match = 0;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
+ plane1 = (uint16_t*)img1->planes[VPX_PLANE_V];
+ plane2 = (uint16_t*)img2->planes[VPX_PLANE_V];
+ stride1 = img1->stride[VPX_PLANE_V]/2;
+ stride2 = img2->stride[VPX_PLANE_V]/2;
+ for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+ for (j = 0; match && j < c_w; j += bsizex) {
+ int k, l;
+ const int si = mmin(i + bsizey, c_h) - i;
+ const int sj = mmin(j + bsizex, c_w) - j;
+ for (k = 0; match && k < si; ++k) {
+ for (l = 0; match && l < sj; ++l) {
+ if (*(plane1 + (i + k) * stride1 + j + l) !=
+ *(plane2 + (i + k) * stride2 + j + l)) {
+ vloc[0] = i + k;
+ vloc[1] = j + l;
+ vloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+ vloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+ match = 0;
+ break;
+ }
+ }
+ }
+ }
+ }
+}
+#endif
+
static void find_mismatch(const vpx_image_t *const img1,
const vpx_image_t *const img2,
int yloc[4], int uloc[4], int vloc[4]) {
@@ -542,7 +665,8 @@ static void find_mismatch(const vpx_image_t *const img1,
static int compare_img(const vpx_image_t *const img1,
const vpx_image_t *const img2) {
- const uint32_t c_w =
+ uint32_t l_w = img1->d_w;
+ uint32_t c_w =
(img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
const uint32_t c_h =
(img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
@@ -552,11 +676,17 @@ static int compare_img(const vpx_image_t *const img1,
match &= (img1->fmt == img2->fmt);
match &= (img1->d_w == img2->d_w);
match &= (img1->d_h == img2->d_h);
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (img1->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+ l_w *= 2;
+ c_w *= 2;
+ }
+#endif
for (i = 0; i < img1->d_h; ++i)
match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
- img1->d_w) == 0);
+ l_w) == 0);
for (i = 0; i < c_h; ++i)
match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
@@ -601,6 +731,10 @@ struct stream_config {
int arg_ctrl_cnt;
int write_webm;
int have_kf_max_dist;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ // whether to use 16bit internal buffers
+ int use_16bit_internal;
+#endif
};
@@ -740,8 +874,9 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) {
#if CONFIG_VP9_ENCODER
// Make default VP9 passes = 2 until there is a better quality 1-pass
// encoder
- global->passes = (strcmp(global->codec->name, "vp9") == 0 &&
- global->deadline != VPX_DL_REALTIME) ? 2 : 1;
+ if (global->codec != NULL && global->codec->name != NULL)
+ global->passes = (strcmp(global->codec->name, "vp9") == 0 &&
+ global->deadline != VPX_DL_REALTIME) ? 2 : 1;
#else
global->passes = 1;
#endif
@@ -809,8 +944,10 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global,
struct stream_state *stream;
stream = calloc(1, sizeof(*stream));
- if (!stream)
+ if (stream == NULL) {
fatal("Failed to allocate new stream.");
+ }
+
if (prev) {
memcpy(stream, prev, sizeof(*stream));
stream->index++;
@@ -870,6 +1007,9 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
static const int *ctrl_args_map = NULL;
struct stream_config *config = &stream->config;
int eos_mark_found = 0;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ int test_16bit_internal = 0;
+#endif
// Handle codec specific options
if (0) {
@@ -918,6 +1058,12 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
config->cfg.g_w = arg_parse_uint(&arg);
} else if (arg_match(&arg, &height, argi)) {
config->cfg.g_h = arg_parse_uint(&arg);
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ } else if (arg_match(&arg, &bitdeptharg, argi)) {
+ config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg);
+ } else if (arg_match(&arg, &inbitdeptharg, argi)) {
+ config->cfg.g_input_bit_depth = arg_parse_uint(&arg);
+#endif
#if CONFIG_WEBM_IO
} else if (arg_match(&arg, &stereo_mode, argi)) {
config->stereo_fmt = arg_parse_enum_or_int(&arg);
@@ -985,6 +1131,12 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
config->have_kf_max_dist = 1;
} else if (arg_match(&arg, &kf_disabled, argi)) {
config->cfg.kf_mode = VPX_KF_DISABLED;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ } else if (arg_match(&arg, &test16bitinternalarg, argi)) {
+ if (strcmp(global->codec->name, "vp9") == 0) {
+ test_16bit_internal = 1;
+ }
+#endif
} else {
int i, match = 0;
for (i = 0; ctrl_args[i]; i++) {
@@ -996,12 +1148,13 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
* instance of this control.
*/
for (j = 0; j < config->arg_ctrl_cnt; j++)
- if (config->arg_ctrls[j][0] == ctrl_args_map[i])
+ if (ctrl_args_map != NULL &&
+ config->arg_ctrls[j][0] == ctrl_args_map[i])
break;
/* Update/insert */
assert(j < (int)ARG_CTRL_CNT_MAX);
- if (j < (int)ARG_CTRL_CNT_MAX) {
+ if (ctrl_args_map != NULL && j < (int)ARG_CTRL_CNT_MAX) {
config->arg_ctrls[j][0] = ctrl_args_map[i];
config->arg_ctrls[j][1] = arg_parse_enum_or_int(&arg);
if (j == config->arg_ctrl_cnt)
@@ -1014,6 +1167,12 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
argj++;
}
}
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (strcmp(global->codec->name, "vp9") == 0) {
+ config->use_16bit_internal = test_16bit_internal |
+ (config->cfg.g_profile > 1);
+ }
+#endif
return eos_mark_found;
}
@@ -1041,6 +1200,14 @@ static void validate_stream_config(const struct stream_state *stream,
experimental_bitstream.long_name);
}
+ // Check that the codec bit depth is not less than the input bit depth.
+ if (stream->config.cfg.g_input_bit_depth >
+ (unsigned int)stream->config.cfg.g_bit_depth) {
+ fatal("Stream %d: codec bit depth (%d) less than input bit depth (%d)",
+ stream->index, (int)stream->config.cfg.g_bit_depth,
+ stream->config.cfg.g_input_bit_depth);
+ }
+
for (streami = stream; streami; streami = streami->next) {
/* All streams require output files */
if (!streami->config.out_fn)
@@ -1149,6 +1316,8 @@ static void show_stream_config(struct stream_state *stream,
SHOW(g_profile);
SHOW(g_w);
SHOW(g_h);
+ SHOW(g_bit_depth);
+ SHOW(g_input_bit_depth);
SHOW(g_timebase.num);
SHOW(g_timebase.den);
SHOW(g_error_resilient);
@@ -1281,6 +1450,9 @@ static void initialize_encoder(struct stream_state *stream,
flags |= global->show_psnr ? VPX_CODEC_USE_PSNR : 0;
flags |= global->out_part ? VPX_CODEC_USE_OUTPUT_PARTITION : 0;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ flags |= stream->config.use_16bit_internal ? VPX_CODEC_USE_HIGHBITDEPTH : 0;
+#endif
/* Construct Encoder Context */
vpx_codec_enc_init(&stream->encoder, global->codec->codec_interface(),
@@ -1326,6 +1498,46 @@ static void encode_frame(struct stream_state *stream,
/ cfg->g_timebase.num / global->framerate.num;
/* Scale if necessary */
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (img) {
+ if ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) &&
+ (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
+ if (img->fmt != VPX_IMG_FMT_I42016) {
+ fprintf(stderr, "%s can only scale 4:2:0 inputs\n", exec_name);
+ exit(EXIT_FAILURE);
+ }
+#if CONFIG_LIBYUV
+ if (!stream->img) {
+ stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I42016,
+ cfg->g_w, cfg->g_h, 16);
+ }
+ I420Scale_16((uint16*)img->planes[VPX_PLANE_Y],
+ img->stride[VPX_PLANE_Y]/2,
+ (uint16*)img->planes[VPX_PLANE_U],
+ img->stride[VPX_PLANE_U]/2,
+ (uint16*)img->planes[VPX_PLANE_V],
+ img->stride[VPX_PLANE_V]/2,
+ img->d_w, img->d_h,
+ (uint16*)stream->img->planes[VPX_PLANE_Y],
+ stream->img->stride[VPX_PLANE_Y]/2,
+ (uint16*)stream->img->planes[VPX_PLANE_U],
+ stream->img->stride[VPX_PLANE_U]/2,
+ (uint16*)stream->img->planes[VPX_PLANE_V],
+ stream->img->stride[VPX_PLANE_V]/2,
+ stream->img->d_w, stream->img->d_h,
+ kFilterBox);
+ img = stream->img;
+#else
+ stream->encoder.err = 1;
+ ctx_exit_on_error(&stream->encoder,
+ "Stream %d: Failed to encode frame.\n"
+ "Scaling disabled in this configuration. \n"
+ "To enable, configure with --enable-libyuv\n",
+ stream->index);
+#endif
+ }
+ }
+#endif
if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
if (img->fmt != VPX_IMG_FMT_I420 && img->fmt != VPX_IMG_FMT_YV12) {
fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name);
@@ -1504,6 +1716,131 @@ static float usec_to_fps(uint64_t usec, unsigned int frames) {
return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0);
}
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static void high_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+ int input_shift) {
+ // Note the offset is 1 less than half
+ const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+ int plane;
+ if (dst->w != src->w || dst->h != src->h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift ||
+ dst->fmt != src->fmt || input_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (src->fmt) {
+ case VPX_IMG_FMT_I42016:
+ case VPX_IMG_FMT_I42216:
+ case VPX_IMG_FMT_I44416:
+ break;
+ default:
+ fatal("Unsupported image conversion");
+ break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->w;
+ int h = src->h;
+ int x, y;
+ if (plane) {
+ w >>= src->x_chroma_shift;
+ h >>= src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ uint16_t *p_src = (uint16_t *)(src->planes[plane] +
+ y * src->stride[plane]);
+ uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
+ y * dst->stride[plane]);
+ for (x = 0; x < w; x++)
+ *p_dst++ = (*p_src++ << input_shift) + offset;
+ }
+ }
+}
+
+static void low_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+ int input_shift) {
+ // Note the offset is 1 less than half
+ const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+ int plane;
+ if (dst->w != src->w || dst->h != src->h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift ||
+ dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
+ input_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (src->fmt) {
+ case VPX_IMG_FMT_I420:
+ case VPX_IMG_FMT_I422:
+ case VPX_IMG_FMT_I444:
+ break;
+ default:
+ fatal("Unsupported image conversion");
+ break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->w;
+ int h = src->h;
+ int x, y;
+ if (plane) {
+ w >>= src->x_chroma_shift;
+ h >>= src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
+ uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
+ y * dst->stride[plane]);
+ for (x = 0; x < w; x++) {
+ *p_dst++ = (*p_src++ << input_shift) + offset;
+ }
+ }
+ }
+}
+
+static void img_upshift(vpx_image_t *dst, vpx_image_t *src,
+ int input_shift) {
+ if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+ high_img_upshift(dst, src, input_shift);
+ } else {
+ low_img_upshift(dst, src, input_shift);
+ }
+}
+
+static void img_cast_16_to_8(vpx_image_t *dst, vpx_image_t *src) {
+ int plane;
+ if (dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH != src->fmt ||
+ dst->d_w != src->d_w || dst->d_h != src->d_h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift) {
+ fatal("Unsupported image conversion");
+ }
+ switch (dst->fmt) {
+ case VPX_IMG_FMT_I420:
+ case VPX_IMG_FMT_I422:
+ case VPX_IMG_FMT_I444:
+ break;
+ default:
+ fatal("Unsupported image conversion");
+ break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w >>= src->x_chroma_shift;
+ h >>= src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ uint16_t *p_src = (uint16_t *)(src->planes[plane] +
+ y * src->stride[plane]);
+ uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
+ for (x = 0; x < w; x++) {
+ *p_dst++ = *p_src++;
+ }
+ }
+ }
+}
+#endif
static void test_decode(struct stream_state *stream,
enum TestDecodeFatality fatal,
@@ -1530,20 +1867,44 @@ static void test_decode(struct stream_state *stream,
vpx_codec_control(&stream->encoder, VP8_COPY_REFERENCE, &ref_enc);
vpx_codec_control(&stream->decoder, VP8_COPY_REFERENCE, &ref_dec);
} else {
- struct vp9_ref_frame ref;
+ struct vp9_ref_frame ref_enc, ref_dec;
- ref.idx = 0;
- vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref);
- enc_img = ref.img;
- vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref);
- dec_img = ref.img;
+ ref_enc.idx = 0;
+ ref_dec.idx = 0;
+ vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref_enc);
+ enc_img = ref_enc.img;
+ vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref_dec);
+ dec_img = ref_dec.img;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if ((enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) !=
+ (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH)) {
+ if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+ vpx_img_alloc(&enc_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+ enc_img.d_w, enc_img.d_h, 16);
+ img_cast_16_to_8(&enc_img, &ref_enc.img);
+ }
+ if (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+ vpx_img_alloc(&dec_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+ dec_img.d_w, dec_img.d_h, 16);
+ img_cast_16_to_8(&dec_img, &ref_dec.img);
+ }
+ }
+#endif
}
ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame");
ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame");
if (!compare_img(&enc_img, &dec_img)) {
int y[4], u[4], v[4];
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+ find_mismatch_high(&enc_img, &dec_img, y, u, v);
+ } else {
+ find_mismatch(&enc_img, &dec_img, y, u, v);
+ }
+#else
find_mismatch(&enc_img, &dec_img, y, u, v);
+#endif
stream->decoder.err = 1;
warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL,
"Stream %d: Encode/decode mismatch on frame %d at"
@@ -1585,6 +1946,12 @@ static void print_time(const char *label, int64_t etl) {
int main(int argc, const char **argv_) {
int pass;
vpx_image_t raw;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ vpx_image_t raw_shift;
+ int allocated_raw_shift = 0;
+ int use_16bit_internal = 0;
+ int input_shift = 0;
+#endif
int frame_avail, got_data;
struct VpxInputContext input;
@@ -1686,6 +2053,27 @@ int main(int argc, const char **argv_) {
if (!input.width || !input.height)
fatal("Specify stream dimensions with --width (-w) "
" and --height (-h)");
+
+ /* If the input file does not specify a bit depth but the input-bit-depth
+ * parameter is given, use that as the input bit depth. If the
+ * input-bit-depth parameter is absent, assume the input bit depth
+ * matches the codec bit depth.
+ */
+ if (!input.bit_depth) {
+ FOREACH_STREAM({
+ if (stream->config.cfg.g_input_bit_depth)
+ input.bit_depth = stream->config.cfg.g_input_bit_depth;
+ else
+ input.bit_depth = stream->config.cfg.g_input_bit_depth =
+ (int)stream->config.cfg.g_bit_depth;
+ });
+ if (input.bit_depth > 8) input.fmt |= VPX_IMG_FMT_HIGHBITDEPTH;
+ } else {
+ FOREACH_STREAM({
+ stream->config.cfg.g_input_bit_depth = input.bit_depth;
+ });
+ }
+
FOREACH_STREAM(set_stream_dimensions(stream, input.width, input.height));
FOREACH_STREAM(validate_stream_config(stream, &global));
@@ -1739,6 +2127,25 @@ int main(int argc, const char **argv_) {
FOREACH_STREAM(open_output_file(stream, &global));
FOREACH_STREAM(initialize_encoder(stream, &global));
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (strcmp(global.codec->name, "vp9") == 0) {
+ // Check to see if at least one stream uses 16 bit internal.
+ // Currently assume that the bit_depths for all streams using
+ // highbitdepth are the same.
+ FOREACH_STREAM({
+ if (stream->config.use_16bit_internal) {
+ use_16bit_internal = 1;
+ }
+ if (stream->config.cfg.g_profile == 0) {
+ input_shift = 0;
+ } else {
+ input_shift = (int)stream->config.cfg.g_bit_depth -
+ stream->config.cfg.g_input_bit_depth;
+ }
+ });
+ }
+#endif
+
frame_avail = 1;
got_data = 0;
@@ -1776,10 +2183,45 @@ int main(int argc, const char **argv_) {
frame_avail = 0;
if (frames_in > global.skip_frames) {
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ vpx_image_t *frame_to_encode;
+ if (input_shift || (use_16bit_internal && input.bit_depth == 8)) {
+ assert(use_16bit_internal);
+ // Input bit depth and stream bit depth do not match, so up
+ // shift frame to stream bit depth
+ if (!allocated_raw_shift) {
+ vpx_img_alloc(&raw_shift, raw.fmt | VPX_IMG_FMT_HIGHBITDEPTH,
+ input.width, input.height, 32);
+ allocated_raw_shift = 1;
+ }
+ img_upshift(&raw_shift, &raw, input_shift);
+ frame_to_encode = &raw_shift;
+ } else {
+ frame_to_encode = &raw;
+ }
+ vpx_usec_timer_start(&timer);
+ if (use_16bit_internal) {
+ assert(frame_to_encode->fmt & VPX_IMG_FMT_HIGHBITDEPTH);
+ FOREACH_STREAM({
+ if (stream->config.use_16bit_internal)
+ encode_frame(stream, &global,
+ frame_avail ? frame_to_encode : NULL,
+ frames_in);
+ else
+ assert(0);
+ });
+ } else {
+ assert((frame_to_encode->fmt & VPX_IMG_FMT_HIGHBITDEPTH) == 0);
+ FOREACH_STREAM(encode_frame(stream, &global,
+ frame_avail ? frame_to_encode : NULL,
+ frames_in));
+ }
+#else
vpx_usec_timer_start(&timer);
FOREACH_STREAM(encode_frame(stream, &global,
frame_avail ? &raw : NULL,
frames_in));
+#endif
vpx_usec_timer_mark(&timer);
cx_time += vpx_usec_timer_elapsed(&timer);
@@ -1788,7 +2230,8 @@ int main(int argc, const char **argv_) {
got_data = 0;
FOREACH_STREAM(get_cx_data(stream, &global, &got_data));
- if (!got_data && input.length && !streams->frames_out) {
+ if (!got_data && input.length && streams != NULL &&
+ !streams->frames_out) {
lagged_count = global.limit ? seen_frames : ftello(input.file);
} else if (input.length) {
int64_t remaining;
@@ -1896,6 +2339,10 @@ int main(int argc, const char **argv_) {
});
#endif
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (allocated_raw_shift)
+ vpx_img_free(&raw_shift);
+#endif
vpx_img_free(&raw);
free(argv);
free(streams);
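
Note: stream setup now derives 16-bit internal buffers from the profile as well as the new --test-16bit-internal switch; the effective rule in parse_stream_params() reduces to

    /* Profiles 2/3 always get 16-bit internal buffers; the test switch can
     * force them for profiles 0/1 (vp9 only). */
    config->use_16bit_internal = test_16bit_internal | (config->cfg.g_profile > 1);

Also worth noting: vpxenc's upshift offset is (1 << (input_shift - 1)) - 1, one less than vpxdec's, as the in-code comments call out.
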
diff --git a/source/libvpx/y4minput.c b/source/libvpx/y4minput.c
index 520c332..bcc742a 100644
--- a/source/libvpx/y4minput.c
+++ b/source/libvpx/y4minput.c
@@ -700,7 +700,7 @@ static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst,
int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
int only_420) {
- char buffer[80];
+ char buffer[80] = {0};
int ret;
int i;
/*Read until newline, or 80 cols, whichever happens first.*/
@@ -978,7 +978,9 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
_y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz);
else
_y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz);
- _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
+
+ if (_y4m->aux_buf_sz > 0)
+ _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
return 0;
}
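
Note: both y4minput.c fixes guard against reading indeterminate memory: the header buffer is zero-filled before it is parsed with string functions, and the aux buffer is only allocated when a conversion actually needs one. The pattern in isolation (aux_sz is a stand-in name):

    char buffer[80] = {0};  /* always NUL-terminated for str* parsing */
    unsigned char *aux = NULL;
    if (aux_sz > 0)         /* skip the pointless zero-length allocation */
      aux = (unsigned char *)malloc(aux_sz);
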