diff options
author | johannkoenig@chromium.org <johannkoenig@chromium.org> | 2014-09-08 18:46:28 +0000 |
---|---|---|
committer | johannkoenig@chromium.org <johannkoenig@chromium.org> | 2014-09-08 18:46:28 +0000 |
commit | d95585fb0ec024f6abd96f7b02e0df58019d46af (patch) | |
tree | 099198c9fe84d7f873666002a1d5c63421785899 | |
parent | 0f393e92b0e220eeaa6acb0ad31e696fa5c67ccd (diff) | |
download | libvpx-d95585fb0ec024f6abd96f7b02e0df58019d46af.tar.gz |
libvpx: Pull from upstream
Current HEAD: c731d6a4f19eea861ceb2ff31399420b2452eb74
git log from upstream:
395f2e8 vp8 encoder: remove vp8_yv12_copy_partial_frame_neon
980abf6 Fixing Mac OS build.
fcd431f libyuv: cherry-pick MSVC arm build fix
1f19ebb Replacing vp9_get_mb_ss_sse2 asm implementation with intrinsics.
1dd9a63 Correct the mode decisions in special cases
1100e26 Removing postproc mmx code.
c97f5e8 vp8 common: change 'HAVE_NEON_ASM' to 'HAVE_NEON' for compiling functions of NEON intrinsics.
a808344 fix x86-darwin* build
35fadf1 bilinearpredict_neon: fix type conversion warnings
bb4950d vp9: correct context buffer resize check
440f509 vp9: fail decode if block/frame refs are corrupt
dbdb87b Fix a visual studio warning
d435148 Enable adaptive motion search for ARF coding
b1153f3 Map motion magnitude in VP9 denoiser.
7897059 Adding temp cpi var.
91998e6 Removing sz member from vpx_codec_priv.
d75266f Update the condition when COPY_BLOCK is chosen.
4909435 Removing unused function prototypes.
202edb3 Actually resetting random generator for all variance test cases.
e30f769 Fix a bug in VP9 denoiser.
ec94967 Revert "Revert "VP8 for ARMv8 by using NEON intrinsics 10""
a51704d vp8 common: change 'HAVE_NEON_ASM' to 'HAVE_NEON' for compiling idct_blk_neon.c.
0002da3 arm: Fix building vp8_subpixelvariance_neon.c with MSVC
48197f0 Adding sse2 variant for vp9_mse{8x8, 8x16, 16x8}.
<...>
TBR=tomfinegan@chromium.org
Review URL: https://codereview.chromium.org/554673004
git-svn-id: http://src.chromium.org/svn/trunk/deps/third_party/libvpx@291859 4ff67af0-8c30-449e-8e8b-ad334ec8d88c
230 files changed, 8210 insertions, 8057 deletions
diff --git a/README.chromium b/README.chromium index 749aa13..d1b5262 100644 --- a/README.chromium +++ b/README.chromium @@ -5,9 +5,9 @@ License: BSD License File: source/libvpx/LICENSE Security Critical: yes -Date: Thursday August 21 2014 +Date: Monday September 08 2014 Branch: master -Commit: 23c88870ec514b0dd7d22b9db99ae63f46c7d87f +Commit: c731d6a4f19eea861ceb2ff31399420b2452eb74 Description: Contains the sources used to compile libvpx binaries used by Google Chrome and diff --git a/libvpx_srcs.gni b/libvpx_srcs.gni index 30c227d..39e8a68 100644 --- a/libvpx_srcs.gni +++ b/libvpx_srcs.gni @@ -347,7 +347,6 @@ libvpx_srcs_x86_assembly = [ "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm", - "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse4.asm", @@ -355,8 +354,6 @@ libvpx_srcs_x86_assembly = [ "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm", - "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm", - "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm", "//third_party/libvpx/source/libvpx/vpx_ports/emms.asm", "//third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm", ] @@ -364,7 +361,6 @@ libvpx_srcs_x86_mmx = [ "//third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c", "//third_party/libvpx/source/libvpx/vp8/common/x86/variance_mmx.c", "//third_party/libvpx/source/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c", - 
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c", ] libvpx_srcs_x86_sse2 = [ "//third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_sse2.c", @@ -743,7 +739,6 @@ libvpx_srcs_x86_64_assembly = [ "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm", - "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse4.asm", @@ -752,8 +747,6 @@ libvpx_srcs_x86_64_assembly = [ "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm", - "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm", - "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm", "//third_party/libvpx/source/libvpx/vpx_ports/emms.asm", "//third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm", ] @@ -761,7 +754,6 @@ libvpx_srcs_x86_64_mmx = [ "//third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c", "//third_party/libvpx/source/libvpx/vp8/common/x86/variance_mmx.c", "//third_party/libvpx/source/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c", - "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c", ] libvpx_srcs_x86_64_sse2 = [ "//third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_sse2.c", @@ -1157,20 +1149,19 @@ libvpx_srcs_arm_neon = [ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.c", 
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_blk_neon.c", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.c", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/reconintra_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c", "//third_party/libvpx/source/libvpx/vp8/common/blockd.c", "//third_party/libvpx/source/libvpx/vp8/common/blockd.h", @@ -1250,10 +1241,8 @@ 
libvpx_srcs_arm_neon = [ "//third_party/libvpx/source/libvpx/vp8/encoder/arm/dct_arm.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/denoising_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm", - "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/picklpf_arm.c", - "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm", + "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/subtract_neon.c", - "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/quantize_arm.c", @@ -1538,13 +1527,6 @@ libvpx_srcs_arm_neon_cpu_detect = [ "//third_party/libvpx/source/libvpx/vp8/common/arm/dequantize_arm.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/filter_arm.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/loopfilter_arm.c", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm", "//third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c", "//third_party/libvpx/source/libvpx/vp8/common/blockd.c", "//third_party/libvpx/source/libvpx/vp8/common/blockd.h", @@ -1623,9 +1605,6 @@ 
libvpx_srcs_arm_neon_cpu_detect = [ "//third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/dct_arm.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm", - "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/picklpf_arm.c", - "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm", - "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/quantize_arm.c", "//third_party/libvpx/source/libvpx/vp8/encoder/bitstream.c", @@ -1879,14 +1858,21 @@ libvpx_srcs_arm_neon_cpu_detect_neon = [ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_blk_neon.c", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.c", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/reconintra_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c", + 
"//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/denoising_neon.c", + "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/subtract_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c", "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c", @@ -1909,13 +1895,20 @@ libvpx_srcs_arm64 = [ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.c", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_blk_neon.c", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.c", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/reconintra_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c", + "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c", 
"//third_party/libvpx/source/libvpx/vp8/common/blockd.c", "//third_party/libvpx/source/libvpx/vp8/common/blockd.h", @@ -1989,6 +1982,7 @@ libvpx_srcs_arm64 = [ "//third_party/libvpx/source/libvpx/vp8/decoder/treereader.h", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/dct_arm.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/denoising_neon.c", + "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/subtract_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/quantize_arm.c", diff --git a/libvpx_srcs_arm64.gypi b/libvpx_srcs_arm64.gypi index 545ff22..a6c51b1 100644 --- a/libvpx_srcs_arm64.gypi +++ b/libvpx_srcs_arm64.gypi @@ -15,13 +15,20 @@ '<(libvpx_source)/vp8/common/arm/neon/dc_only_idct_add_neon.c', '<(libvpx_source)/vp8/common/arm/neon/dequant_idct_neon.c', '<(libvpx_source)/vp8/common/arm/neon/dequantizeb_neon.c', + '<(libvpx_source)/vp8/common/arm/neon/idct_blk_neon.c', + '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_0_2x_neon.c', + '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_full_2x_neon.c', '<(libvpx_source)/vp8/common/arm/neon/iwalsh_neon.c', + '<(libvpx_source)/vp8/common/arm/neon/loopfilter_neon.c', '<(libvpx_source)/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c', + '<(libvpx_source)/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c', '<(libvpx_source)/vp8/common/arm/neon/mbloopfilter_neon.c', + '<(libvpx_source)/vp8/common/arm/neon/reconintra_neon.c', '<(libvpx_source)/vp8/common/arm/neon/sad_neon.c', '<(libvpx_source)/vp8/common/arm/neon/shortidct4x4llm_neon.c', '<(libvpx_source)/vp8/common/arm/neon/sixtappredict_neon.c', '<(libvpx_source)/vp8/common/arm/neon/variance_neon.c', + '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance_neon.c', '<(libvpx_source)/vp8/common/arm/variance_arm.c', '<(libvpx_source)/vp8/common/blockd.c', 
'<(libvpx_source)/vp8/common/blockd.h', @@ -95,6 +102,7 @@ '<(libvpx_source)/vp8/decoder/treereader.h', '<(libvpx_source)/vp8/encoder/arm/dct_arm.c', '<(libvpx_source)/vp8/encoder/arm/neon/denoising_neon.c', + '<(libvpx_source)/vp8/encoder/arm/neon/shortfdct_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/subtract_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c', '<(libvpx_source)/vp8/encoder/arm/quantize_arm.c', diff --git a/libvpx_srcs_arm_neon.gypi b/libvpx_srcs_arm_neon.gypi index 80973bb..2ce983a 100644 --- a/libvpx_srcs_arm_neon.gypi +++ b/libvpx_srcs_arm_neon.gypi @@ -39,20 +39,19 @@ '<(libvpx_source)/vp8/common/arm/neon/dequant_idct_neon.c', '<(libvpx_source)/vp8/common/arm/neon/dequantizeb_neon.c', '<(libvpx_source)/vp8/common/arm/neon/idct_blk_neon.c', - '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm', - '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm', + '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_0_2x_neon.c', + '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_full_2x_neon.c', '<(libvpx_source)/vp8/common/arm/neon/iwalsh_neon.c', - '<(libvpx_source)/vp8/common/arm/neon/loopfilter_neon.asm', + '<(libvpx_source)/vp8/common/arm/neon/loopfilter_neon.c', '<(libvpx_source)/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c', - '<(libvpx_source)/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm', + '<(libvpx_source)/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c', '<(libvpx_source)/vp8/common/arm/neon/mbloopfilter_neon.c', + '<(libvpx_source)/vp8/common/arm/neon/reconintra_neon.c', '<(libvpx_source)/vp8/common/arm/neon/sad_neon.c', '<(libvpx_source)/vp8/common/arm/neon/shortidct4x4llm_neon.c', '<(libvpx_source)/vp8/common/arm/neon/sixtappredict_neon.c', '<(libvpx_source)/vp8/common/arm/neon/variance_neon.c', - '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm', - '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm', - 
'<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm', + '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance_neon.c', '<(libvpx_source)/vp8/common/arm/variance_arm.c', '<(libvpx_source)/vp8/common/blockd.c', '<(libvpx_source)/vp8/common/blockd.h', @@ -132,10 +131,8 @@ '<(libvpx_source)/vp8/encoder/arm/dct_arm.c', '<(libvpx_source)/vp8/encoder/arm/neon/denoising_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/fastquantizeb_neon.asm', - '<(libvpx_source)/vp8/encoder/arm/neon/picklpf_arm.c', - '<(libvpx_source)/vp8/encoder/arm/neon/shortfdct_neon.asm', + '<(libvpx_source)/vp8/encoder/arm/neon/shortfdct_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/subtract_neon.c', - '<(libvpx_source)/vp8/encoder/arm/neon/vp8_memcpy_neon.asm', '<(libvpx_source)/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm', '<(libvpx_source)/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c', '<(libvpx_source)/vp8/encoder/arm/quantize_arm.c', diff --git a/libvpx_srcs_arm_neon_cpu_detect.gypi b/libvpx_srcs_arm_neon_cpu_detect.gypi index 796f50b..2af52a7 100644 --- a/libvpx_srcs_arm_neon_cpu_detect.gypi +++ b/libvpx_srcs_arm_neon_cpu_detect.gypi @@ -33,13 +33,6 @@ '<(libvpx_source)/vp8/common/arm/dequantize_arm.c', '<(libvpx_source)/vp8/common/arm/filter_arm.c', '<(libvpx_source)/vp8/common/arm/loopfilter_arm.c', - '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm', - '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm', - '<(libvpx_source)/vp8/common/arm/neon/loopfilter_neon.asm', - '<(libvpx_source)/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm', - '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm', - '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm', - '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm', '<(libvpx_source)/vp8/common/arm/variance_arm.c', '<(libvpx_source)/vp8/common/blockd.c', '<(libvpx_source)/vp8/common/blockd.h', @@ -118,9 +111,6 @@ 
'<(libvpx_source)/vp8/encoder/arm/armv6/walsh_v6.asm', '<(libvpx_source)/vp8/encoder/arm/dct_arm.c', '<(libvpx_source)/vp8/encoder/arm/neon/fastquantizeb_neon.asm', - '<(libvpx_source)/vp8/encoder/arm/neon/picklpf_arm.c', - '<(libvpx_source)/vp8/encoder/arm/neon/shortfdct_neon.asm', - '<(libvpx_source)/vp8/encoder/arm/neon/vp8_memcpy_neon.asm', '<(libvpx_source)/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm', '<(libvpx_source)/vp8/encoder/arm/quantize_arm.c', '<(libvpx_source)/vp8/encoder/bitstream.c', diff --git a/libvpx_srcs_arm_neon_cpu_detect_intrinsics.gypi b/libvpx_srcs_arm_neon_cpu_detect_intrinsics.gypi index 2fa1ba6..07eab36 100644 --- a/libvpx_srcs_arm_neon_cpu_detect_intrinsics.gypi +++ b/libvpx_srcs_arm_neon_cpu_detect_intrinsics.gypi @@ -19,14 +19,21 @@ '<(libvpx_source)/vp8/common/arm/neon/dequant_idct_neon.c', '<(libvpx_source)/vp8/common/arm/neon/dequantizeb_neon.c', '<(libvpx_source)/vp8/common/arm/neon/idct_blk_neon.c', + '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_0_2x_neon.c', + '<(libvpx_source)/vp8/common/arm/neon/idct_dequant_full_2x_neon.c', '<(libvpx_source)/vp8/common/arm/neon/iwalsh_neon.c', + '<(libvpx_source)/vp8/common/arm/neon/loopfilter_neon.c', '<(libvpx_source)/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c', + '<(libvpx_source)/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c', '<(libvpx_source)/vp8/common/arm/neon/mbloopfilter_neon.c', + '<(libvpx_source)/vp8/common/arm/neon/reconintra_neon.c', '<(libvpx_source)/vp8/common/arm/neon/sad_neon.c', '<(libvpx_source)/vp8/common/arm/neon/shortidct4x4llm_neon.c', '<(libvpx_source)/vp8/common/arm/neon/sixtappredict_neon.c', '<(libvpx_source)/vp8/common/arm/neon/variance_neon.c', + '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/denoising_neon.c', + '<(libvpx_source)/vp8/encoder/arm/neon/shortfdct_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/subtract_neon.c', 
'<(libvpx_source)/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c', '<(libvpx_source)/vp9/common/arm/neon/vp9_convolve_neon.c', diff --git a/libvpx_srcs_x86.gypi b/libvpx_srcs_x86.gypi index 91791a6..f5b3995 100644 --- a/libvpx_srcs_x86.gypi +++ b/libvpx_srcs_x86.gypi @@ -309,7 +309,6 @@ '<(libvpx_source)/vp9/encoder/x86/vp9_dct_mmx.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_error_sse2.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_sad4d_sse2.asm', - '<(libvpx_source)/vp9/encoder/x86/vp9_sad_mmx.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_sad_sse2.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_sad_sse3.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_sad_sse4.asm', @@ -317,8 +316,6 @@ '<(libvpx_source)/vp9/encoder/x86/vp9_subpel_variance.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_subtract_sse2.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm', - '<(libvpx_source)/vp9/encoder/x86/vp9_variance_impl_mmx.asm', - '<(libvpx_source)/vp9/encoder/x86/vp9_variance_impl_sse2.asm', '<(libvpx_source)/vp9/vp9_cx_iface.c', '<(libvpx_source)/vp9/vp9_dx_iface.c', '<(libvpx_source)/vp9/vp9_iface_common.h', diff --git a/libvpx_srcs_x86_64.gypi b/libvpx_srcs_x86_64.gypi index 07c46a7..f29c6c8 100644 --- a/libvpx_srcs_x86_64.gypi +++ b/libvpx_srcs_x86_64.gypi @@ -314,7 +314,6 @@ '<(libvpx_source)/vp9/encoder/x86/vp9_error_sse2.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_sad4d_sse2.asm', - '<(libvpx_source)/vp9/encoder/x86/vp9_sad_mmx.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_sad_sse2.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_sad_sse3.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_sad_sse4.asm', @@ -323,8 +322,6 @@ '<(libvpx_source)/vp9/encoder/x86/vp9_subpel_variance.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_subtract_sse2.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm', - '<(libvpx_source)/vp9/encoder/x86/vp9_variance_impl_mmx.asm', - 
'<(libvpx_source)/vp9/encoder/x86/vp9_variance_impl_sse2.asm', '<(libvpx_source)/vp9/vp9_cx_iface.c', '<(libvpx_source)/vp9/vp9_dx_iface.c', '<(libvpx_source)/vp9/vp9_iface_common.h', diff --git a/libvpx_srcs_x86_64_intrinsics.gypi b/libvpx_srcs_x86_64_intrinsics.gypi index a47c1e5..bb1b203 100644 --- a/libvpx_srcs_x86_64_intrinsics.gypi +++ b/libvpx_srcs_x86_64_intrinsics.gypi @@ -16,7 +16,6 @@ '<(libvpx_source)/vp8/common/x86/idct_blk_mmx.c', '<(libvpx_source)/vp8/common/x86/variance_mmx.c', '<(libvpx_source)/vp8/encoder/x86/vp8_enc_stubs_mmx.c', - '<(libvpx_source)/vp9/encoder/x86/vp9_variance_mmx.c', ], 'cflags': [ '-mmmx', ], 'xcode_settings': { 'OTHER_CFLAGS': [ '-mmmx' ] }, diff --git a/libvpx_srcs_x86_intrinsics.gypi b/libvpx_srcs_x86_intrinsics.gypi index a47c1e5..bb1b203 100644 --- a/libvpx_srcs_x86_intrinsics.gypi +++ b/libvpx_srcs_x86_intrinsics.gypi @@ -16,7 +16,6 @@ '<(libvpx_source)/vp8/common/x86/idct_blk_mmx.c', '<(libvpx_source)/vp8/common/x86/variance_mmx.c', '<(libvpx_source)/vp8/encoder/x86/vp8_enc_stubs_mmx.c', - '<(libvpx_source)/vp9/encoder/x86/vp9_variance_mmx.c', ], 'cflags': [ '-mmmx', ], 'xcode_settings': { 'OTHER_CFLAGS': [ '-mmmx' ] }, diff --git a/source/config/linux/arm-neon-cpu-detect/vp8_rtcd.h b/source/config/linux/arm-neon-cpu-detect/vp8_rtcd.h index 67936cc..9e41308 100644 --- a/source/config/linux/arm-neon-cpu-detect/vp8_rtcd.h +++ b/source/config/linux/arm-neon-cpu-detect/vp8_rtcd.h @@ -59,10 +59,12 @@ int vp8_block_error_c(short *coeff, short *dqcoeff); #define vp8_block_error vp8_block_error_c void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); -#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c +void vp8_build_intra_predictors_mbuv_s_neon(struct macroblockd *x, unsigned char * uabove_row, unsigned 
char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); +RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); -#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c +void vp8_build_intra_predictors_mby_s_neon(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); +RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); void vp8_clear_system_state_c(); #define vp8_clear_system_state vp8_clear_system_state_c @@ -420,10 +422,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_armv6(const unsigned char *src_ptr, unsigned int vp8_variance_halfpixvar16x16_v_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -void vp8_yv12_copy_partial_frame_neon(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -RTCD_EXTERN void (*vp8_yv12_copy_partial_frame)(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); - void vp8_rtcd(void); #include "vpx_config.h" @@ -444,6 +442,10 @@ static void 
setup_rtcd_internal(void) if (flags & HAS_NEON) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_neon; vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_armv6; if (flags & HAS_NEON) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_neon; + vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_c; + if (flags & HAS_NEON) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_neon; + vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_c; + if (flags & HAS_NEON) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_neon; vp8_copy_mem16x16 = vp8_copy_mem16x16_v6; if (flags & HAS_NEON) vp8_copy_mem16x16 = vp8_copy_mem16x16_neon; vp8_copy_mem8x4 = vp8_copy_mem8x4_v6; @@ -544,8 +546,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_neon; vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_armv6; if (flags & HAS_NEON) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_neon; - vp8_yv12_copy_partial_frame = vp8_yv12_copy_partial_frame_c; - if (flags & HAS_NEON) vp8_yv12_copy_partial_frame = vp8_yv12_copy_partial_frame_neon; } #endif diff --git a/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h b/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h index fac25a0..c4da123 100644 --- a/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h +++ b/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h @@ -28,15 +28,6 @@ struct mv; union int_mv; struct yv12_buffer_config; -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer 
vp9_blend_mb_outer_c - int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_c diff --git a/source/config/linux/arm-neon-cpu-detect/vpx_config.asm b/source/config/linux/arm-neon-cpu-detect/vpx_config.asm index f8b3a15..5e15c83 100644 --- a/source/config/linux/arm-neon-cpu-detect/vpx_config.asm +++ b/source/config/linux/arm-neon-cpu-detect/vpx_config.asm @@ -82,6 +82,7 @@ .equ CONFIG_MULTI_RES_ENCODING , 1 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 .equ CONFIG_EXPERIMENTAL , 0 .equ CONFIG_SIZE_LIMIT , 0 .equ CONFIG_SPATIAL_SVC , 0 diff --git a/source/config/linux/arm-neon-cpu-detect/vpx_config.h b/source/config/linux/arm-neon-cpu-detect/vpx_config.h index 9cfd076..0bb6cee 100644 --- a/source/config/linux/arm-neon-cpu-detect/vpx_config.h +++ b/source/config/linux/arm-neon-cpu-detect/vpx_config.h @@ -91,6 +91,7 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 0 #define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/linux/arm-neon/vp8_rtcd.h b/source/config/linux/arm-neon/vp8_rtcd.h index 00f2e61..703294a 100644 --- a/source/config/linux/arm-neon/vp8_rtcd.h +++ b/source/config/linux/arm-neon/vp8_rtcd.h @@ -59,10 +59,12 @@ int vp8_block_error_c(short *coeff, short *dqcoeff); #define vp8_block_error vp8_block_error_c void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); -#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c +void vp8_build_intra_predictors_mbuv_s_neon(struct macroblockd *x, unsigned char * uabove_row, unsigned char * 
vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); +#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_neon void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); -#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c +void vp8_build_intra_predictors_mby_s_neon(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); +#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon void vp8_clear_system_state_c(); #define vp8_clear_system_state vp8_clear_system_state_c @@ -420,10 +422,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_armv6(const unsigned char *src_ptr, unsigned int vp8_variance_halfpixvar16x16_v_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); #define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_neon -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -void vp8_yv12_copy_partial_frame_neon(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_neon - void vp8_rtcd(void); #include "vpx_config.h" diff --git a/source/config/linux/arm-neon/vp9_rtcd.h b/source/config/linux/arm-neon/vp9_rtcd.h index ff6a27e..cd2cc54 100644 --- a/source/config/linux/arm-neon/vp9_rtcd.h +++ b/source/config/linux/arm-neon/vp9_rtcd.h @@ -28,15 +28,6 @@ struct mv; union int_mv; struct yv12_buffer_config; -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int 
y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_c diff --git a/source/config/linux/arm-neon/vpx_config.asm b/source/config/linux/arm-neon/vpx_config.asm index a9eab27..fbd36f1 100644 --- a/source/config/linux/arm-neon/vpx_config.asm +++ b/source/config/linux/arm-neon/vpx_config.asm @@ -82,6 +82,7 @@ .equ CONFIG_MULTI_RES_ENCODING , 1 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 .equ CONFIG_EXPERIMENTAL , 0 .equ CONFIG_SIZE_LIMIT , 0 .equ CONFIG_SPATIAL_SVC , 0 diff --git a/source/config/linux/arm-neon/vpx_config.h b/source/config/linux/arm-neon/vpx_config.h index c497ddb..b858039 100644 --- a/source/config/linux/arm-neon/vpx_config.h +++ b/source/config/linux/arm-neon/vpx_config.h @@ -91,6 +91,7 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 0 #define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/linux/arm/vp8_rtcd.h b/source/config/linux/arm/vp8_rtcd.h index ec35c11..780d938 100644 --- a/source/config/linux/arm/vp8_rtcd.h +++ b/source/config/linux/arm/vp8_rtcd.h @@ -366,9 +366,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int unsigned int vp8_variance_halfpixvar16x16_v_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); #define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_armv6 -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct 
yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c - void vp8_rtcd(void); #include "vpx_config.h" diff --git a/source/config/linux/arm/vp9_rtcd.h b/source/config/linux/arm/vp9_rtcd.h index 0ebc52b..2be563e 100644 --- a/source/config/linux/arm/vp9_rtcd.h +++ b/source/config/linux/arm/vp9_rtcd.h @@ -28,15 +28,6 @@ struct mv; union int_mv; struct yv12_buffer_config; -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_c diff --git a/source/config/linux/arm/vpx_config.asm b/source/config/linux/arm/vpx_config.asm index dd8be51..d8c8989 100644 --- a/source/config/linux/arm/vpx_config.asm +++ b/source/config/linux/arm/vpx_config.asm @@ -82,6 +82,7 @@ .equ CONFIG_MULTI_RES_ENCODING , 1 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 .equ CONFIG_EXPERIMENTAL , 0 .equ CONFIG_SIZE_LIMIT , 0 .equ CONFIG_SPATIAL_SVC , 0 diff --git a/source/config/linux/arm/vpx_config.h b/source/config/linux/arm/vpx_config.h index ee5f10d..5967658 100644 --- a/source/config/linux/arm/vpx_config.h +++ b/source/config/linux/arm/vpx_config.h @@ -91,6 +91,7 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 0 #define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/linux/arm64/vp8_rtcd.h 
b/source/config/linux/arm64/vp8_rtcd.h index 9d45b89..f1b86d0 100644 --- a/source/config/linux/arm64/vp8_rtcd.h +++ b/source/config/linux/arm64/vp8_rtcd.h @@ -55,10 +55,12 @@ int vp8_block_error_c(short *coeff, short *dqcoeff); #define vp8_block_error vp8_block_error_c void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); -#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c +void vp8_build_intra_predictors_mbuv_s_neon(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); +#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_neon void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); -#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c +void vp8_build_intra_predictors_mby_s_neon(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); +#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon void vp8_clear_system_state_c(); #define vp8_clear_system_state vp8_clear_system_state_c @@ -92,10 +94,12 @@ void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *output, i #define vp8_dequant_idct_add vp8_dequant_idct_add_neon void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); -#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c +void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, 
int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); -#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon void vp8_dequantize_b_c(struct blockd*, short *dqc); void vp8_dequantize_b_neon(struct blockd*, short *dqc); @@ -132,10 +136,12 @@ void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left #define vp8_intra4x4_predict vp8_intra4x4_predict_c void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -#define vp8_loop_filter_bh vp8_loop_filter_bh_c +void vp8_loop_filter_bh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_neon void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -#define vp8_loop_filter_bv vp8_loop_filter_bv_c +void vp8_loop_filter_bv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_neon void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); void vp8_loop_filter_mbh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); @@ -150,14 +156,16 @@ void vp8_loop_filter_bhs_neon(unsigned char *y, int ystride, const unsigned char #define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_neon void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned 
char *blimit); -#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c +void vp8_loop_filter_bvs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_neon void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); void vp8_loop_filter_mbhs_neon(unsigned char *y, int ystride, const unsigned char *blimit); #define vp8_loop_filter_simple_mbh vp8_loop_filter_mbhs_neon void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c +void vp8_loop_filter_mbvs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_mbvs_neon int vp8_mbblock_error_c(struct macroblock *mb, int dc); #define vp8_mbblock_error vp8_mbblock_error_c @@ -267,10 +275,12 @@ void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned #define vp8_sad8x8x8 vp8_sad8x8x8_c void vp8_short_fdct4x4_c(short *input, short *output, int pitch); -#define vp8_short_fdct4x4 vp8_short_fdct4x4_c +void vp8_short_fdct4x4_neon(short *input, short *output, int pitch); +#define vp8_short_fdct4x4 vp8_short_fdct4x4_neon void vp8_short_fdct8x4_c(short *input, short *output, int pitch); -#define vp8_short_fdct8x4 vp8_short_fdct8x4_c +void vp8_short_fdct8x4_neon(short *input, short *output, int pitch); +#define vp8_short_fdct8x4 vp8_short_fdct8x4_neon void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); @@ -360,9 +370,6 @@ unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, 
unsigned int *sse); #define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c - void vp8_rtcd(void); #include "vpx_config.h" diff --git a/source/config/linux/arm64/vp9_rtcd.h b/source/config/linux/arm64/vp9_rtcd.h index 582837a..176e7af 100644 --- a/source/config/linux/arm64/vp9_rtcd.h +++ b/source/config/linux/arm64/vp9_rtcd.h @@ -28,15 +28,6 @@ struct mv; union int_mv; struct yv12_buffer_config; -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_c diff --git a/source/config/linux/arm64/vpx_config.asm b/source/config/linux/arm64/vpx_config.asm index bb141a1..a03bced 100644 --- a/source/config/linux/arm64/vpx_config.asm +++ b/source/config/linux/arm64/vpx_config.asm @@ -82,6 +82,7 @@ .equ CONFIG_MULTI_RES_ENCODING , 1 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 .equ CONFIG_EXPERIMENTAL , 0 .equ CONFIG_SIZE_LIMIT , 0 .equ CONFIG_SPATIAL_SVC , 0 diff --git a/source/config/linux/arm64/vpx_config.h b/source/config/linux/arm64/vpx_config.h index e791223..06f3045 100644 --- a/source/config/linux/arm64/vpx_config.h +++ b/source/config/linux/arm64/vpx_config.h @@ -91,6 +91,7 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define 
CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 0 #define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/linux/generic/vp8_rtcd.h b/source/config/linux/generic/vp8_rtcd.h index 298886d..79edff7 100644 --- a/source/config/linux/generic/vp8_rtcd.h +++ b/source/config/linux/generic/vp8_rtcd.h @@ -323,9 +323,6 @@ unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); #define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c - void vp8_rtcd(void); #include "vpx_config.h" diff --git a/source/config/linux/generic/vp9_rtcd.h b/source/config/linux/generic/vp9_rtcd.h index c2df3fb..5c9b779 100644 --- a/source/config/linux/generic/vp9_rtcd.h +++ b/source/config/linux/generic/vp9_rtcd.h @@ -28,15 +28,6 @@ struct mv; union int_mv; struct yv12_buffer_config; -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_c diff --git a/source/config/linux/generic/vpx_config.asm b/source/config/linux/generic/vpx_config.asm index 42f23e4..b2fa7be 100644 --- 
a/source/config/linux/generic/vpx_config.asm +++ b/source/config/linux/generic/vpx_config.asm @@ -82,6 +82,7 @@ .equ CONFIG_MULTI_RES_ENCODING , 1 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 .equ CONFIG_EXPERIMENTAL , 0 .equ CONFIG_SIZE_LIMIT , 0 .equ CONFIG_SPATIAL_SVC , 0 diff --git a/source/config/linux/generic/vpx_config.h b/source/config/linux/generic/vpx_config.h index 75d1415..a16afde 100644 --- a/source/config/linux/generic/vpx_config.h +++ b/source/config/linux/generic/vpx_config.h @@ -91,6 +91,7 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 0 #define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/linux/ia32/vp8_rtcd.h b/source/config/linux/ia32/vp8_rtcd.h index 4dc2d75..fd88326 100644 --- a/source/config/linux/ia32/vp8_rtcd.h +++ b/source/config/linux/ia32/vp8_rtcd.h @@ -480,9 +480,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, in unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c - void vp8_rtcd(void); #ifdef RTCD_C diff --git a/source/config/linux/ia32/vp9_rtcd.h b/source/config/linux/ia32/vp9_rtcd.h index 5d4bb2f..aa34a25 100644 --- a/source/config/linux/ia32/vp9_rtcd.h +++ b/source/config/linux/ia32/vp9_rtcd.h @@ -28,15 +28,6 @@ struct mv; union int_mv; struct yv12_buffer_config; -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, 
int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); RTCD_EXTERN int64_t (*vp9_block_error)(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); @@ -286,12 +277,10 @@ void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8 RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); RTCD_EXTERN void (*vp9_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); unsigned int vp9_get_mb_ss_c(const int16_t *); -unsigned int vp9_get_mb_ss_mmx(const int16_t *); unsigned int vp9_get_mb_ss_sse2(const int16_t *); RTCD_EXTERN unsigned int (*vp9_get_mb_ss)(const int16_t *); @@ -420,18 +409,20 @@ void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, RTCD_EXTERN void (*vp9_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t 
*thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_c +unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c +unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_c +unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); void 
vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_b vp9_quantize_b_c @@ -449,7 +440,6 @@ int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int #define vp9_refining_search_sad vp9_refining_search_sad_c unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vp9_sad16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); @@ -482,7 +472,6 @@ void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t RTCD_EXTERN void (*vp9_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vp9_sad16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); @@ -545,7 +534,6 @@ void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t RTCD_EXTERN void (*vp9_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); unsigned int 
vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vp9_sad4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); @@ -610,7 +598,6 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *re #define vp9_sad64x64x8 vp9_sad64x64x8_c unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vp9_sad8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); @@ -645,7 +632,6 @@ void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p #define vp9_sad8x4x8 vp9_sad8x4x8_c unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vp9_sad8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); @@ -834,7 +820,6 @@ void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab RTCD_EXTERN void (*vp9_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); 
-unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); @@ -843,7 +828,6 @@ unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, c RTCD_EXTERN unsigned int (*vp9_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp9_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); @@ -860,7 +844,6 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, c RTCD_EXTERN unsigned int (*vp9_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp9_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t 
*ref_ptr, int ref_stride, unsigned int *sse); @@ -877,7 +860,6 @@ unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, c RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp9_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); @@ -886,7 +868,6 @@ unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, con RTCD_EXTERN unsigned int (*vp9_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); @@ -994,10 +975,8 @@ static void setup_rtcd_internal(void) vp9_get16x16var = vp9_get16x16var_c; if (flags & HAS_SSE2) vp9_get16x16var = vp9_get16x16var_sse2; vp9_get8x8var = vp9_get8x8var_c; - if (flags & HAS_MMX) vp9_get8x8var = vp9_get8x8var_mmx; if (flags & HAS_SSE2) vp9_get8x8var = vp9_get8x8var_sse2; vp9_get_mb_ss = vp9_get_mb_ss_c; - 
if (flags & HAS_MMX) vp9_get_mb_ss = vp9_get_mb_ss_mmx; if (flags & HAS_SSE2) vp9_get_mb_ss = vp9_get_mb_ss_sse2; vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3; @@ -1060,10 +1039,14 @@ static void setup_rtcd_internal(void) vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_c; if (flags & HAS_SSE2) vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_sse2; vp9_mse16x16 = vp9_mse16x16_c; - if (flags & HAS_MMX) vp9_mse16x16 = vp9_mse16x16_mmx; if (flags & HAS_SSE2) vp9_mse16x16 = vp9_mse16x16_sse2; + vp9_mse16x8 = vp9_mse16x8_c; + if (flags & HAS_SSE2) vp9_mse16x8 = vp9_mse16x8_sse2; + vp9_mse8x16 = vp9_mse8x16_c; + if (flags & HAS_SSE2) vp9_mse8x16 = vp9_mse8x16_sse2; + vp9_mse8x8 = vp9_mse8x8_c; + if (flags & HAS_SSE2) vp9_mse8x8 = vp9_mse8x8_sse2; vp9_sad16x16 = vp9_sad16x16_c; - if (flags & HAS_MMX) vp9_sad16x16 = vp9_sad16x16_mmx; if (flags & HAS_SSE2) vp9_sad16x16 = vp9_sad16x16_sse2; vp9_sad16x16_avg = vp9_sad16x16_avg_c; if (flags & HAS_SSE2) vp9_sad16x16_avg = vp9_sad16x16_avg_sse2; @@ -1079,7 +1062,6 @@ static void setup_rtcd_internal(void) vp9_sad16x32x4d = vp9_sad16x32x4d_c; if (flags & HAS_SSE2) vp9_sad16x32x4d = vp9_sad16x32x4d_sse2; vp9_sad16x8 = vp9_sad16x8_c; - if (flags & HAS_MMX) vp9_sad16x8 = vp9_sad16x8_mmx; if (flags & HAS_SSE2) vp9_sad16x8 = vp9_sad16x8_sse2; vp9_sad16x8_avg = vp9_sad16x8_avg_c; if (flags & HAS_SSE2) vp9_sad16x8_avg = vp9_sad16x8_avg_sse2; @@ -1107,7 +1089,6 @@ static void setup_rtcd_internal(void) vp9_sad32x64x4d = vp9_sad32x64x4d_c; if (flags & HAS_SSE2) vp9_sad32x64x4d = vp9_sad32x64x4d_sse2; vp9_sad4x4 = vp9_sad4x4_c; - if (flags & HAS_MMX) vp9_sad4x4 = vp9_sad4x4_mmx; if (flags & HAS_SSE) vp9_sad4x4 = vp9_sad4x4_sse; vp9_sad4x4_avg = vp9_sad4x4_avg_c; if (flags & HAS_SSE) vp9_sad4x4_avg = vp9_sad4x4_avg_sse; @@ -1134,7 +1115,6 @@ static void setup_rtcd_internal(void) vp9_sad64x64x4d = vp9_sad64x64x4d_c; if (flags & HAS_SSE2) vp9_sad64x64x4d = 
vp9_sad64x64x4d_sse2; vp9_sad8x16 = vp9_sad8x16_c; - if (flags & HAS_MMX) vp9_sad8x16 = vp9_sad8x16_mmx; if (flags & HAS_SSE2) vp9_sad8x16 = vp9_sad8x16_sse2; vp9_sad8x16_avg = vp9_sad8x16_avg_c; if (flags & HAS_SSE2) vp9_sad8x16_avg = vp9_sad8x16_avg_sse2; @@ -1149,7 +1129,6 @@ static void setup_rtcd_internal(void) vp9_sad8x4x4d = vp9_sad8x4x4d_c; if (flags & HAS_SSE2) vp9_sad8x4x4d = vp9_sad8x4x4d_sse2; vp9_sad8x8 = vp9_sad8x8_c; - if (flags & HAS_MMX) vp9_sad8x8 = vp9_sad8x8_mmx; if (flags & HAS_SSE2) vp9_sad8x8 = vp9_sad8x8_sse2; vp9_sad8x8_avg = vp9_sad8x8_avg_c; if (flags & HAS_SSE2) vp9_sad8x8_avg = vp9_sad8x8_avg_sse2; @@ -1254,12 +1233,10 @@ static void setup_rtcd_internal(void) vp9_v_predictor_8x8 = vp9_v_predictor_8x8_c; if (flags & HAS_SSE) vp9_v_predictor_8x8 = vp9_v_predictor_8x8_sse; vp9_variance16x16 = vp9_variance16x16_c; - if (flags & HAS_MMX) vp9_variance16x16 = vp9_variance16x16_mmx; if (flags & HAS_SSE2) vp9_variance16x16 = vp9_variance16x16_sse2; vp9_variance16x32 = vp9_variance16x32_c; if (flags & HAS_SSE2) vp9_variance16x32 = vp9_variance16x32_sse2; vp9_variance16x8 = vp9_variance16x8_c; - if (flags & HAS_MMX) vp9_variance16x8 = vp9_variance16x8_mmx; if (flags & HAS_SSE2) vp9_variance16x8 = vp9_variance16x8_sse2; vp9_variance32x16 = vp9_variance32x16_c; if (flags & HAS_SSE2) vp9_variance32x16 = vp9_variance32x16_sse2; @@ -1268,7 +1245,6 @@ static void setup_rtcd_internal(void) vp9_variance32x64 = vp9_variance32x64_c; if (flags & HAS_SSE2) vp9_variance32x64 = vp9_variance32x64_sse2; vp9_variance4x4 = vp9_variance4x4_c; - if (flags & HAS_MMX) vp9_variance4x4 = vp9_variance4x4_mmx; if (flags & HAS_SSE2) vp9_variance4x4 = vp9_variance4x4_sse2; vp9_variance4x8 = vp9_variance4x8_c; if (flags & HAS_SSE2) vp9_variance4x8 = vp9_variance4x8_sse2; @@ -1277,12 +1253,10 @@ static void setup_rtcd_internal(void) vp9_variance64x64 = vp9_variance64x64_c; if (flags & HAS_SSE2) vp9_variance64x64 = vp9_variance64x64_sse2; vp9_variance8x16 = vp9_variance8x16_c; 
- if (flags & HAS_MMX) vp9_variance8x16 = vp9_variance8x16_mmx; if (flags & HAS_SSE2) vp9_variance8x16 = vp9_variance8x16_sse2; vp9_variance8x4 = vp9_variance8x4_c; if (flags & HAS_SSE2) vp9_variance8x4 = vp9_variance8x4_sse2; vp9_variance8x8 = vp9_variance8x8_c; - if (flags & HAS_MMX) vp9_variance8x8 = vp9_variance8x8_mmx; if (flags & HAS_SSE2) vp9_variance8x8 = vp9_variance8x8_sse2; } #endif diff --git a/source/config/linux/ia32/vpx_config.asm b/source/config/linux/ia32/vpx_config.asm index ddde8b0..a340007 100644 --- a/source/config/linux/ia32/vpx_config.asm +++ b/source/config/linux/ia32/vpx_config.asm @@ -79,6 +79,7 @@ %define CONFIG_MULTI_RES_ENCODING 1 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +%define CONFIG_VP9_HIGHBITDEPTH 0 %define CONFIG_EXPERIMENTAL 0 %define CONFIG_SIZE_LIMIT 0 %define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/linux/ia32/vpx_config.h b/source/config/linux/ia32/vpx_config.h index 705af6e..5b8fc38 100644 --- a/source/config/linux/ia32/vpx_config.h +++ b/source/config/linux/ia32/vpx_config.h @@ -91,6 +91,7 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 0 #define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/linux/mips64el/vp8_rtcd.h b/source/config/linux/mips64el/vp8_rtcd.h index 58dc2fb..9848bb8 100644 --- a/source/config/linux/mips64el/vp8_rtcd.h +++ b/source/config/linux/mips64el/vp8_rtcd.h @@ -326,9 +326,6 @@ unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); #define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); 
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c - void vp8_rtcd(void); #include "vpx_config.h" diff --git a/source/config/linux/mips64el/vp9_rtcd.h b/source/config/linux/mips64el/vp9_rtcd.h index c2df3fb..5c9b779 100644 --- a/source/config/linux/mips64el/vp9_rtcd.h +++ b/source/config/linux/mips64el/vp9_rtcd.h @@ -28,15 +28,6 @@ struct mv; union int_mv; struct yv12_buffer_config; -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_c diff --git a/source/config/linux/mips64el/vpx_config.h b/source/config/linux/mips64el/vpx_config.h index 934484e..736b66a 100644 --- a/source/config/linux/mips64el/vpx_config.h +++ b/source/config/linux/mips64el/vpx_config.h @@ -91,6 +91,7 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 0 #define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/linux/mipsel/vp8_rtcd.h b/source/config/linux/mipsel/vp8_rtcd.h index 58dc2fb..9848bb8 100644 --- a/source/config/linux/mipsel/vp8_rtcd.h +++ b/source/config/linux/mipsel/vp8_rtcd.h @@ -326,9 +326,6 @@ unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); #define 
vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c - void vp8_rtcd(void); #include "vpx_config.h" diff --git a/source/config/linux/mipsel/vp9_rtcd.h b/source/config/linux/mipsel/vp9_rtcd.h index c2df3fb..5c9b779 100644 --- a/source/config/linux/mipsel/vp9_rtcd.h +++ b/source/config/linux/mipsel/vp9_rtcd.h @@ -28,15 +28,6 @@ struct mv; union int_mv; struct yv12_buffer_config; -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_c diff --git a/source/config/linux/mipsel/vpx_config.h b/source/config/linux/mipsel/vpx_config.h index 5e0b6f2..e0bb723 100644 --- a/source/config/linux/mipsel/vpx_config.h +++ b/source/config/linux/mipsel/vpx_config.h @@ -91,6 +91,7 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 0 #define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/linux/x64/vp8_rtcd.h b/source/config/linux/x64/vp8_rtcd.h index 7caa03a..b2fd3d2 100644 --- a/source/config/linux/x64/vp8_rtcd.h +++ b/source/config/linux/x64/vp8_rtcd.h @@ -480,9 +480,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, in unsigned int 
vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); #define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c - void vp8_rtcd(void); #ifdef RTCD_C diff --git a/source/config/linux/x64/vp9_rtcd.h b/source/config/linux/x64/vp9_rtcd.h index ed9a72b..4e8678a 100644 --- a/source/config/linux/x64/vp9_rtcd.h +++ b/source/config/linux/x64/vp9_rtcd.h @@ -28,15 +28,6 @@ struct mv; union int_mv; struct yv12_buffer_config; -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_sse2 @@ -287,12 +278,10 @@ void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8 #define vp9_get16x16var vp9_get16x16var_sse2 void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); #define 
vp9_get8x8var vp9_get8x8var_sse2 unsigned int vp9_get_mb_ss_c(const int16_t *); -unsigned int vp9_get_mb_ss_mmx(const int16_t *); unsigned int vp9_get_mb_ss_sse2(const int16_t *); #define vp9_get_mb_ss vp9_get_mb_ss_sse2 @@ -423,18 +412,20 @@ void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, #define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_sse2 unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); #define vp9_mse16x16 vp9_mse16x16_sse2 unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_c +unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vp9_mse16x8 vp9_mse16x8_sse2 unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c +unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vp9_mse8x16 vp9_mse8x16_sse2 unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_c +unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vp9_mse8x8 vp9_mse8x8_sse2 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t 
*quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vp9_quantize_b_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); @@ -456,7 +447,6 @@ int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int #define vp9_refining_search_sad vp9_refining_search_sad_c unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); #define vp9_sad16x16 vp9_sad16x16_sse2 @@ -489,7 +479,6 @@ void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t #define vp9_sad16x32x4d vp9_sad16x32x4d_sse2 unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); #define vp9_sad16x8 vp9_sad16x8_sse2 @@ -552,7 +541,6 @@ void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t #define vp9_sad32x64x4d vp9_sad32x64x4d_sse2 unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int 
vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); #define vp9_sad4x4 vp9_sad4x4_sse @@ -617,7 +605,6 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *re #define vp9_sad64x64x8 vp9_sad64x64x8_c unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); #define vp9_sad8x16 vp9_sad8x16_sse2 @@ -652,7 +639,6 @@ void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p #define vp9_sad8x4x8 vp9_sad8x4x8_c unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); #define vp9_sad8x8 vp9_sad8x8_sse2 @@ -842,7 +828,6 @@ void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance16x16 vp9_variance16x16_sse2 @@ -851,7 +836,6 @@ unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, c #define vp9_variance16x32 vp9_variance16x32_sse2 unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, 
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance16x8 vp9_variance16x8_sse2 @@ -868,7 +852,6 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, c #define vp9_variance32x64 vp9_variance32x64_sse2 unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance4x4 vp9_variance4x4_sse2 @@ -885,7 +868,6 @@ unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, c #define vp9_variance64x64 vp9_variance64x64_sse2 unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance8x16 vp9_variance8x16_sse2 @@ -894,7 +876,6 @@ unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, con #define vp9_variance8x4 vp9_variance8x4_sse2 unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned 
int *sse); unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance8x8 vp9_variance8x8_sse2 diff --git a/source/config/linux/x64/vpx_config.asm b/source/config/linux/x64/vpx_config.asm index c34dcd3..1cc8999 100644 --- a/source/config/linux/x64/vpx_config.asm +++ b/source/config/linux/x64/vpx_config.asm @@ -79,6 +79,7 @@ %define CONFIG_MULTI_RES_ENCODING 1 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +%define CONFIG_VP9_HIGHBITDEPTH 0 %define CONFIG_EXPERIMENTAL 0 %define CONFIG_SIZE_LIMIT 0 %define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/linux/x64/vpx_config.h b/source/config/linux/x64/vpx_config.h index 8b99a23..e88c097 100644 --- a/source/config/linux/x64/vpx_config.h +++ b/source/config/linux/x64/vpx_config.h @@ -91,6 +91,7 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 0 #define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/mac/ia32/vp8_rtcd.h b/source/config/mac/ia32/vp8_rtcd.h index 4dc2d75..fd88326 100644 --- a/source/config/mac/ia32/vp8_rtcd.h +++ b/source/config/mac/ia32/vp8_rtcd.h @@ -480,9 +480,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, in unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c - void vp8_rtcd(void); #ifdef RTCD_C diff --git 
a/source/config/mac/ia32/vp9_rtcd.h b/source/config/mac/ia32/vp9_rtcd.h index bd56bc3..fc9dc85 100644 --- a/source/config/mac/ia32/vp9_rtcd.h +++ b/source/config/mac/ia32/vp9_rtcd.h @@ -28,15 +28,6 @@ struct mv; union int_mv; struct yv12_buffer_config; -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_c @@ -262,13 +253,10 @@ void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t #define vp9_get16x16var vp9_get16x16var_c void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vp9_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vp9_get8x8var vp9_get8x8var_c unsigned int vp9_get_mb_ss_c(const int16_t *); -unsigned int vp9_get_mb_ss_mmx(const int16_t *); -unsigned int vp9_get_mb_ss_sse2(const int16_t *); -RTCD_EXTERN unsigned int (*vp9_get_mb_ss)(const int16_t *); +#define vp9_get_mb_ss vp9_get_mb_ss_c void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c @@ -391,8 +379,7 @@ void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, RTCD_EXTERN void 
(*vp9_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vp9_mse16x16 vp9_mse16x16_c unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); #define vp9_mse16x8 vp9_mse16x8_c @@ -419,8 +406,7 @@ int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int #define vp9_refining_search_sad vp9_refining_search_sad_c unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -RTCD_EXTERN unsigned int (*vp9_sad16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad16x16 vp9_sad16x16_c unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vp9_sad16x16_avg vp9_sad16x16_avg_c @@ -448,8 +434,7 @@ void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t RTCD_EXTERN void (*vp9_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t 
*ref_ptr, int ref_stride); -RTCD_EXTERN unsigned int (*vp9_sad16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad16x8 vp9_sad16x8_c unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vp9_sad16x8_avg vp9_sad16x8_avg_c @@ -503,8 +488,7 @@ void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t RTCD_EXTERN void (*vp9_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -RTCD_EXTERN unsigned int (*vp9_sad4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad4x4 vp9_sad4x4_c unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vp9_sad4x4_avg vp9_sad4x4_avg_c @@ -560,8 +544,7 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *re #define vp9_sad64x64x8 vp9_sad64x64x8_c unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -RTCD_EXTERN unsigned int (*vp9_sad8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad8x16 vp9_sad8x16_c unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vp9_sad8x16_avg vp9_sad8x16_avg_c @@ -591,8 +574,7 @@ void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p 
#define vp9_sad8x4x8 vp9_sad8x4x8_c unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -RTCD_EXTERN unsigned int (*vp9_sad8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad8x8 vp9_sad8x8_c unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vp9_sad8x8_avg vp9_sad8x8_avg_c @@ -718,15 +700,13 @@ void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance16x16 vp9_variance16x16_c unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance16x32 vp9_variance16x32_c unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance16x8 vp9_variance16x8_c unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned 
int *sse); #define vp9_variance32x16 vp9_variance32x16_c @@ -738,8 +718,7 @@ unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, cons #define vp9_variance32x64 vp9_variance32x64_c unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance4x4 vp9_variance4x4_c unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance4x8 vp9_variance4x8_c @@ -751,15 +730,13 @@ unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, cons #define vp9_variance64x64 vp9_variance64x64_c unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance8x16 vp9_variance8x16_c unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance8x4 vp9_variance8x4_c unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const 
uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance8x8 vp9_variance8x8_c void vp9_rtcd(void); @@ -816,11 +793,6 @@ static void setup_rtcd_internal(void) vp9_full_search_sad = vp9_full_search_sad_c; if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3; if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; - vp9_get8x8var = vp9_get8x8var_c; - if (flags & HAS_MMX) vp9_get8x8var = vp9_get8x8var_mmx; - vp9_get_mb_ss = vp9_get_mb_ss_c; - if (flags & HAS_MMX) vp9_get_mb_ss = vp9_get_mb_ss_mmx; - if (flags & HAS_SSE2) vp9_get_mb_ss = vp9_get_mb_ss_sse2; vp9_idct16x16_10_add = vp9_idct16x16_10_add_c; if (flags & HAS_SSE2) vp9_idct16x16_10_add = vp9_idct16x16_10_add_sse2; if (flags & HAS_SSSE3) vp9_idct16x16_10_add = vp9_idct16x16_10_add_ssse3; @@ -873,10 +845,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_lpf_vertical_8 = vp9_lpf_vertical_8_sse2; vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_c; if (flags & HAS_SSE2) vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_sse2; - vp9_mse16x16 = vp9_mse16x16_c; - if (flags & HAS_MMX) vp9_mse16x16 = vp9_mse16x16_mmx; - vp9_sad16x16 = vp9_sad16x16_c; - if (flags & HAS_MMX) vp9_sad16x16 = vp9_sad16x16_mmx; vp9_sad16x16x3 = vp9_sad16x16x3_c; if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3; if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3; @@ -884,8 +852,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_sad16x16x4d = vp9_sad16x16x4d_sse2; vp9_sad16x32x4d = vp9_sad16x32x4d_c; if (flags & HAS_SSE2) vp9_sad16x32x4d = vp9_sad16x32x4d_sse2; - vp9_sad16x8 = vp9_sad16x8_c; - if (flags & HAS_MMX) vp9_sad16x8 = vp9_sad16x8_mmx; vp9_sad16x8x3 = vp9_sad16x8x3_c; if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3; if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3; @@ -897,8 +863,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_sad32x32x4d = vp9_sad32x32x4d_sse2; vp9_sad32x64x4d = 
vp9_sad32x64x4d_c; if (flags & HAS_SSE2) vp9_sad32x64x4d = vp9_sad32x64x4d_sse2; - vp9_sad4x4 = vp9_sad4x4_c; - if (flags & HAS_MMX) vp9_sad4x4 = vp9_sad4x4_mmx; vp9_sad4x4x3 = vp9_sad4x4x3_c; if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3; vp9_sad4x4x4d = vp9_sad4x4x4d_c; @@ -909,32 +873,18 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_sad64x32x4d = vp9_sad64x32x4d_sse2; vp9_sad64x64x4d = vp9_sad64x64x4d_c; if (flags & HAS_SSE2) vp9_sad64x64x4d = vp9_sad64x64x4d_sse2; - vp9_sad8x16 = vp9_sad8x16_c; - if (flags & HAS_MMX) vp9_sad8x16 = vp9_sad8x16_mmx; vp9_sad8x16x3 = vp9_sad8x16x3_c; if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3; vp9_sad8x16x4d = vp9_sad8x16x4d_c; if (flags & HAS_SSE2) vp9_sad8x16x4d = vp9_sad8x16x4d_sse2; vp9_sad8x4x4d = vp9_sad8x4x4d_c; if (flags & HAS_SSE2) vp9_sad8x4x4d = vp9_sad8x4x4d_sse2; - vp9_sad8x8 = vp9_sad8x8_c; - if (flags & HAS_MMX) vp9_sad8x8 = vp9_sad8x8_mmx; vp9_sad8x8x3 = vp9_sad8x8x3_c; if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3; vp9_sad8x8x4d = vp9_sad8x8x4d_c; if (flags & HAS_SSE2) vp9_sad8x8x4d = vp9_sad8x8x4d_sse2; vp9_temporal_filter_apply = vp9_temporal_filter_apply_c; if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2; - vp9_variance16x16 = vp9_variance16x16_c; - if (flags & HAS_MMX) vp9_variance16x16 = vp9_variance16x16_mmx; - vp9_variance16x8 = vp9_variance16x8_c; - if (flags & HAS_MMX) vp9_variance16x8 = vp9_variance16x8_mmx; - vp9_variance4x4 = vp9_variance4x4_c; - if (flags & HAS_MMX) vp9_variance4x4 = vp9_variance4x4_mmx; - vp9_variance8x16 = vp9_variance8x16_c; - if (flags & HAS_MMX) vp9_variance8x16 = vp9_variance8x16_mmx; - vp9_variance8x8 = vp9_variance8x8_c; - if (flags & HAS_MMX) vp9_variance8x8 = vp9_variance8x8_mmx; } #endif diff --git a/source/config/mac/ia32/vpx_config.asm b/source/config/mac/ia32/vpx_config.asm index d06f05d..54a6abd 100644 --- a/source/config/mac/ia32/vpx_config.asm +++ 
b/source/config/mac/ia32/vpx_config.asm @@ -79,6 +79,7 @@ %define CONFIG_MULTI_RES_ENCODING 1 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +%define CONFIG_VP9_HIGHBITDEPTH 0 %define CONFIG_EXPERIMENTAL 0 %define CONFIG_SIZE_LIMIT 0 %define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/mac/ia32/vpx_config.h b/source/config/mac/ia32/vpx_config.h index 9b7b399..c3e8947 100644 --- a/source/config/mac/ia32/vpx_config.h +++ b/source/config/mac/ia32/vpx_config.h @@ -91,6 +91,7 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 0 #define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/mac/x64/vp8_rtcd.h b/source/config/mac/x64/vp8_rtcd.h index 7caa03a..b2fd3d2 100644 --- a/source/config/mac/x64/vp8_rtcd.h +++ b/source/config/mac/x64/vp8_rtcd.h @@ -480,9 +480,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, in unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); #define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c - void vp8_rtcd(void); #ifdef RTCD_C diff --git a/source/config/mac/x64/vp9_rtcd.h b/source/config/mac/x64/vp9_rtcd.h index ed9a72b..4e8678a 100644 --- a/source/config/mac/x64/vp9_rtcd.h +++ b/source/config/mac/x64/vp9_rtcd.h @@ -28,15 +28,6 @@ struct mv; union int_mv; struct yv12_buffer_config; -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int 
stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_sse2 @@ -287,12 +278,10 @@ void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8 #define vp9_get16x16var vp9_get16x16var_sse2 void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); #define vp9_get8x8var vp9_get8x8var_sse2 unsigned int vp9_get_mb_ss_c(const int16_t *); -unsigned int vp9_get_mb_ss_mmx(const int16_t *); unsigned int vp9_get_mb_ss_sse2(const int16_t *); #define vp9_get_mb_ss vp9_get_mb_ss_sse2 @@ -423,18 +412,20 @@ void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, #define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_sse2 unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); #define vp9_mse16x16 vp9_mse16x16_sse2 unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned 
int *sse); -#define vp9_mse16x8 vp9_mse16x8_c +unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vp9_mse16x8 vp9_mse16x8_sse2 unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c +unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vp9_mse8x16 vp9_mse8x16_sse2 unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_c +unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vp9_mse8x8 vp9_mse8x8_sse2 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vp9_quantize_b_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); @@ -456,7 +447,6 @@ int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int #define vp9_refining_search_sad vp9_refining_search_sad_c unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int 
vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); #define vp9_sad16x16 vp9_sad16x16_sse2 @@ -489,7 +479,6 @@ void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t #define vp9_sad16x32x4d vp9_sad16x32x4d_sse2 unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); #define vp9_sad16x8 vp9_sad16x8_sse2 @@ -552,7 +541,6 @@ void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t #define vp9_sad32x64x4d vp9_sad32x64x4d_sse2 unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); #define vp9_sad4x4 vp9_sad4x4_sse @@ -617,7 +605,6 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *re #define vp9_sad64x64x8 vp9_sad64x64x8_c unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); #define vp9_sad8x16 vp9_sad8x16_sse2 @@ -652,7 +639,6 @@ void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p #define vp9_sad8x4x8 vp9_sad8x4x8_c unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); #define vp9_sad8x8 vp9_sad8x8_sse2 @@ -842,7 +828,6 @@ void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance16x16 vp9_variance16x16_sse2 @@ -851,7 +836,6 @@ unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, c #define vp9_variance16x32 vp9_variance16x32_sse2 unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance16x8 vp9_variance16x8_sse2 @@ -868,7 +852,6 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, c #define vp9_variance32x64 vp9_variance32x64_sse2 unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define 
vp9_variance4x4 vp9_variance4x4_sse2 @@ -885,7 +868,6 @@ unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, c #define vp9_variance64x64 vp9_variance64x64_sse2 unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance8x16 vp9_variance8x16_sse2 @@ -894,7 +876,6 @@ unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, con #define vp9_variance8x4 vp9_variance8x4_sse2 unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance8x8 vp9_variance8x8_sse2 diff --git a/source/config/mac/x64/vpx_config.asm b/source/config/mac/x64/vpx_config.asm index c34dcd3..1cc8999 100644 --- a/source/config/mac/x64/vpx_config.asm +++ b/source/config/mac/x64/vpx_config.asm @@ -79,6 +79,7 @@ %define CONFIG_MULTI_RES_ENCODING 1 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +%define CONFIG_VP9_HIGHBITDEPTH 0 %define CONFIG_EXPERIMENTAL 0 %define CONFIG_SIZE_LIMIT 0 %define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/mac/x64/vpx_config.h b/source/config/mac/x64/vpx_config.h index 8b99a23..e88c097 100644 --- a/source/config/mac/x64/vpx_config.h +++ b/source/config/mac/x64/vpx_config.h @@ -91,6 +91,7 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define 
CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 0 #define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/nacl/vp8_rtcd.h b/source/config/nacl/vp8_rtcd.h index 298886d..79edff7 100644 --- a/source/config/nacl/vp8_rtcd.h +++ b/source/config/nacl/vp8_rtcd.h @@ -323,9 +323,6 @@ unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); #define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c - void vp8_rtcd(void); #include "vpx_config.h" diff --git a/source/config/nacl/vp9_rtcd.h b/source/config/nacl/vp9_rtcd.h index c2df3fb..5c9b779 100644 --- a/source/config/nacl/vp9_rtcd.h +++ b/source/config/nacl/vp9_rtcd.h @@ -28,15 +28,6 @@ struct mv; union int_mv; struct yv12_buffer_config; -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_c diff --git a/source/config/nacl/vpx_config.asm b/source/config/nacl/vpx_config.asm index 42f23e4..b2fa7be 100644 --- a/source/config/nacl/vpx_config.asm +++ b/source/config/nacl/vpx_config.asm @@ -82,6 +82,7 @@ .equ CONFIG_MULTI_RES_ENCODING , 
1 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 .equ CONFIG_EXPERIMENTAL , 0 .equ CONFIG_SIZE_LIMIT , 0 .equ CONFIG_SPATIAL_SVC , 0 diff --git a/source/config/nacl/vpx_config.h b/source/config/nacl/vpx_config.h index 75d1415..a16afde 100644 --- a/source/config/nacl/vpx_config.h +++ b/source/config/nacl/vpx_config.h @@ -91,6 +91,7 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 0 #define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/win/ia32/vp8_rtcd.h b/source/config/win/ia32/vp8_rtcd.h index 4dc2d75..fd88326 100644 --- a/source/config/win/ia32/vp8_rtcd.h +++ b/source/config/win/ia32/vp8_rtcd.h @@ -480,9 +480,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, in unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c - void vp8_rtcd(void); #ifdef RTCD_C diff --git a/source/config/win/ia32/vp9_rtcd.h b/source/config/win/ia32/vp9_rtcd.h index 5d4bb2f..aa34a25 100644 --- a/source/config/win/ia32/vp9_rtcd.h +++ b/source/config/win/ia32/vp9_rtcd.h @@ -28,15 +28,6 @@ struct mv; union int_mv; struct yv12_buffer_config; -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); 
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); RTCD_EXTERN int64_t (*vp9_block_error)(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); @@ -286,12 +277,10 @@ void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8 RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); RTCD_EXTERN void (*vp9_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); unsigned int vp9_get_mb_ss_c(const int16_t *); -unsigned int vp9_get_mb_ss_mmx(const int16_t *); unsigned int vp9_get_mb_ss_sse2(const int16_t *); RTCD_EXTERN unsigned int (*vp9_get_mb_ss)(const int16_t *); @@ -420,18 +409,20 @@ void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, RTCD_EXTERN void (*vp9_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, 
unsigned int *sse); -unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_c +unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c +unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_c +unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, 
int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_b vp9_quantize_b_c @@ -449,7 +440,6 @@ int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int #define vp9_refining_search_sad vp9_refining_search_sad_c unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vp9_sad16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); @@ -482,7 +472,6 @@ void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t RTCD_EXTERN void (*vp9_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vp9_sad16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); @@ -545,7 +534,6 @@ void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t RTCD_EXTERN void (*vp9_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int 
vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vp9_sad4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); @@ -610,7 +598,6 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *re #define vp9_sad64x64x8 vp9_sad64x64x8_c unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vp9_sad8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); @@ -645,7 +632,6 @@ void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p #define vp9_sad8x4x8 vp9_sad8x4x8_c unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vp9_sad8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); @@ -834,7 +820,6 @@ void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab RTCD_EXTERN void (*vp9_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const 
uint8_t *ref_ptr, int ref_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); @@ -843,7 +828,6 @@ unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, c RTCD_EXTERN unsigned int (*vp9_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp9_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); @@ -860,7 +844,6 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, c RTCD_EXTERN unsigned int (*vp9_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp9_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); @@ -877,7 +860,6 @@ unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, c RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp9_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); @@ -886,7 +868,6 @@ unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, con RTCD_EXTERN unsigned int (*vp9_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); @@ -994,10 +975,8 @@ static void setup_rtcd_internal(void) vp9_get16x16var = vp9_get16x16var_c; if (flags & HAS_SSE2) vp9_get16x16var = vp9_get16x16var_sse2; vp9_get8x8var = vp9_get8x8var_c; - if (flags & HAS_MMX) vp9_get8x8var = vp9_get8x8var_mmx; if (flags & HAS_SSE2) vp9_get8x8var = vp9_get8x8var_sse2; vp9_get_mb_ss = vp9_get_mb_ss_c; - if (flags & HAS_MMX) vp9_get_mb_ss = vp9_get_mb_ss_mmx; if (flags & HAS_SSE2) vp9_get_mb_ss = vp9_get_mb_ss_sse2; vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = 
vp9_h_predictor_16x16_ssse3; @@ -1060,10 +1039,14 @@ static void setup_rtcd_internal(void) vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_c; if (flags & HAS_SSE2) vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_sse2; vp9_mse16x16 = vp9_mse16x16_c; - if (flags & HAS_MMX) vp9_mse16x16 = vp9_mse16x16_mmx; if (flags & HAS_SSE2) vp9_mse16x16 = vp9_mse16x16_sse2; + vp9_mse16x8 = vp9_mse16x8_c; + if (flags & HAS_SSE2) vp9_mse16x8 = vp9_mse16x8_sse2; + vp9_mse8x16 = vp9_mse8x16_c; + if (flags & HAS_SSE2) vp9_mse8x16 = vp9_mse8x16_sse2; + vp9_mse8x8 = vp9_mse8x8_c; + if (flags & HAS_SSE2) vp9_mse8x8 = vp9_mse8x8_sse2; vp9_sad16x16 = vp9_sad16x16_c; - if (flags & HAS_MMX) vp9_sad16x16 = vp9_sad16x16_mmx; if (flags & HAS_SSE2) vp9_sad16x16 = vp9_sad16x16_sse2; vp9_sad16x16_avg = vp9_sad16x16_avg_c; if (flags & HAS_SSE2) vp9_sad16x16_avg = vp9_sad16x16_avg_sse2; @@ -1079,7 +1062,6 @@ static void setup_rtcd_internal(void) vp9_sad16x32x4d = vp9_sad16x32x4d_c; if (flags & HAS_SSE2) vp9_sad16x32x4d = vp9_sad16x32x4d_sse2; vp9_sad16x8 = vp9_sad16x8_c; - if (flags & HAS_MMX) vp9_sad16x8 = vp9_sad16x8_mmx; if (flags & HAS_SSE2) vp9_sad16x8 = vp9_sad16x8_sse2; vp9_sad16x8_avg = vp9_sad16x8_avg_c; if (flags & HAS_SSE2) vp9_sad16x8_avg = vp9_sad16x8_avg_sse2; @@ -1107,7 +1089,6 @@ static void setup_rtcd_internal(void) vp9_sad32x64x4d = vp9_sad32x64x4d_c; if (flags & HAS_SSE2) vp9_sad32x64x4d = vp9_sad32x64x4d_sse2; vp9_sad4x4 = vp9_sad4x4_c; - if (flags & HAS_MMX) vp9_sad4x4 = vp9_sad4x4_mmx; if (flags & HAS_SSE) vp9_sad4x4 = vp9_sad4x4_sse; vp9_sad4x4_avg = vp9_sad4x4_avg_c; if (flags & HAS_SSE) vp9_sad4x4_avg = vp9_sad4x4_avg_sse; @@ -1134,7 +1115,6 @@ static void setup_rtcd_internal(void) vp9_sad64x64x4d = vp9_sad64x64x4d_c; if (flags & HAS_SSE2) vp9_sad64x64x4d = vp9_sad64x64x4d_sse2; vp9_sad8x16 = vp9_sad8x16_c; - if (flags & HAS_MMX) vp9_sad8x16 = vp9_sad8x16_mmx; if (flags & HAS_SSE2) vp9_sad8x16 = vp9_sad8x16_sse2; vp9_sad8x16_avg = vp9_sad8x16_avg_c; if (flags & HAS_SSE2) 
vp9_sad8x16_avg = vp9_sad8x16_avg_sse2; @@ -1149,7 +1129,6 @@ static void setup_rtcd_internal(void) vp9_sad8x4x4d = vp9_sad8x4x4d_c; if (flags & HAS_SSE2) vp9_sad8x4x4d = vp9_sad8x4x4d_sse2; vp9_sad8x8 = vp9_sad8x8_c; - if (flags & HAS_MMX) vp9_sad8x8 = vp9_sad8x8_mmx; if (flags & HAS_SSE2) vp9_sad8x8 = vp9_sad8x8_sse2; vp9_sad8x8_avg = vp9_sad8x8_avg_c; if (flags & HAS_SSE2) vp9_sad8x8_avg = vp9_sad8x8_avg_sse2; @@ -1254,12 +1233,10 @@ static void setup_rtcd_internal(void) vp9_v_predictor_8x8 = vp9_v_predictor_8x8_c; if (flags & HAS_SSE) vp9_v_predictor_8x8 = vp9_v_predictor_8x8_sse; vp9_variance16x16 = vp9_variance16x16_c; - if (flags & HAS_MMX) vp9_variance16x16 = vp9_variance16x16_mmx; if (flags & HAS_SSE2) vp9_variance16x16 = vp9_variance16x16_sse2; vp9_variance16x32 = vp9_variance16x32_c; if (flags & HAS_SSE2) vp9_variance16x32 = vp9_variance16x32_sse2; vp9_variance16x8 = vp9_variance16x8_c; - if (flags & HAS_MMX) vp9_variance16x8 = vp9_variance16x8_mmx; if (flags & HAS_SSE2) vp9_variance16x8 = vp9_variance16x8_sse2; vp9_variance32x16 = vp9_variance32x16_c; if (flags & HAS_SSE2) vp9_variance32x16 = vp9_variance32x16_sse2; @@ -1268,7 +1245,6 @@ static void setup_rtcd_internal(void) vp9_variance32x64 = vp9_variance32x64_c; if (flags & HAS_SSE2) vp9_variance32x64 = vp9_variance32x64_sse2; vp9_variance4x4 = vp9_variance4x4_c; - if (flags & HAS_MMX) vp9_variance4x4 = vp9_variance4x4_mmx; if (flags & HAS_SSE2) vp9_variance4x4 = vp9_variance4x4_sse2; vp9_variance4x8 = vp9_variance4x8_c; if (flags & HAS_SSE2) vp9_variance4x8 = vp9_variance4x8_sse2; @@ -1277,12 +1253,10 @@ static void setup_rtcd_internal(void) vp9_variance64x64 = vp9_variance64x64_c; if (flags & HAS_SSE2) vp9_variance64x64 = vp9_variance64x64_sse2; vp9_variance8x16 = vp9_variance8x16_c; - if (flags & HAS_MMX) vp9_variance8x16 = vp9_variance8x16_mmx; if (flags & HAS_SSE2) vp9_variance8x16 = vp9_variance8x16_sse2; vp9_variance8x4 = vp9_variance8x4_c; if (flags & HAS_SSE2) vp9_variance8x4 = 
vp9_variance8x4_sse2; vp9_variance8x8 = vp9_variance8x8_c; - if (flags & HAS_MMX) vp9_variance8x8 = vp9_variance8x8_mmx; if (flags & HAS_SSE2) vp9_variance8x8 = vp9_variance8x8_sse2; } #endif diff --git a/source/config/win/ia32/vpx_config.asm b/source/config/win/ia32/vpx_config.asm index 87a317c..9522ee1 100644 --- a/source/config/win/ia32/vpx_config.asm +++ b/source/config/win/ia32/vpx_config.asm @@ -79,6 +79,7 @@ %define CONFIG_MULTI_RES_ENCODING 1 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +%define CONFIG_VP9_HIGHBITDEPTH 0 %define CONFIG_EXPERIMENTAL 0 %define CONFIG_SIZE_LIMIT 0 %define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/win/ia32/vpx_config.h b/source/config/win/ia32/vpx_config.h index 601cd8d..8fef84f 100644 --- a/source/config/win/ia32/vpx_config.h +++ b/source/config/win/ia32/vpx_config.h @@ -91,6 +91,7 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 0 #define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/win/x64/vp8_rtcd.h b/source/config/win/x64/vp8_rtcd.h index 7caa03a..b2fd3d2 100644 --- a/source/config/win/x64/vp8_rtcd.h +++ b/source/config/win/x64/vp8_rtcd.h @@ -480,9 +480,6 @@ unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, in unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); #define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c - void vp8_rtcd(void); #ifdef RTCD_C diff --git a/source/config/win/x64/vp9_rtcd.h b/source/config/win/x64/vp9_rtcd.h index ed9a72b..4e8678a 100644 --- 
a/source/config/win/x64/vp9_rtcd.h +++ b/source/config/win/x64/vp9_rtcd.h @@ -28,15 +28,6 @@ struct mv; union int_mv; struct yv12_buffer_config; -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_sse2 @@ -287,12 +278,10 @@ void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8 #define vp9_get16x16var vp9_get16x16var_sse2 void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); #define vp9_get8x8var vp9_get8x8var_sse2 unsigned int vp9_get_mb_ss_c(const int16_t *); -unsigned int vp9_get_mb_ss_mmx(const int16_t *); unsigned int vp9_get_mb_ss_sse2(const int16_t *); #define vp9_get_mb_ss vp9_get_mb_ss_sse2 @@ -423,18 +412,20 @@ void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, #define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_sse2 unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_mmx(const 
uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); #define vp9_mse16x16 vp9_mse16x16_sse2 unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_c +unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vp9_mse16x8 vp9_mse16x8_sse2 unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c +unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vp9_mse8x16 vp9_mse8x16_sse2 unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_c +unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vp9_mse8x8 vp9_mse8x8_sse2 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vp9_quantize_b_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); @@ -456,7 
+447,6 @@ int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int #define vp9_refining_search_sad vp9_refining_search_sad_c unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); #define vp9_sad16x16 vp9_sad16x16_sse2 @@ -489,7 +479,6 @@ void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t #define vp9_sad16x32x4d vp9_sad16x32x4d_sse2 unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); #define vp9_sad16x8 vp9_sad16x8_sse2 @@ -552,7 +541,6 @@ void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t #define vp9_sad32x64x4d vp9_sad32x64x4d_sse2 unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); #define vp9_sad4x4 vp9_sad4x4_sse @@ -617,7 +605,6 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *re #define vp9_sad64x64x8 vp9_sad64x64x8_c unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int ref_stride); #define vp9_sad8x16 vp9_sad8x16_sse2 @@ -652,7 +639,6 @@ void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p #define vp9_sad8x4x8 vp9_sad8x4x8_c unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); #define vp9_sad8x8 vp9_sad8x8_sse2 @@ -842,7 +828,6 @@ void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance16x16 vp9_variance16x16_sse2 @@ -851,7 +836,6 @@ unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, c #define vp9_variance16x32 vp9_variance16x32_sse2 unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance16x8 vp9_variance16x8_sse2 @@ -868,7 +852,6 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, c #define vp9_variance32x64 vp9_variance32x64_sse2 unsigned int 
vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance4x4 vp9_variance4x4_sse2 @@ -885,7 +868,6 @@ unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, c #define vp9_variance64x64 vp9_variance64x64_sse2 unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance8x16 vp9_variance8x16_sse2 @@ -894,7 +876,6 @@ unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, con #define vp9_variance8x4 vp9_variance8x4_sse2 unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vp9_variance8x8 vp9_variance8x8_sse2 diff --git a/source/config/win/x64/vpx_config.asm b/source/config/win/x64/vpx_config.asm index 9af49e7..f184ca7 100644 --- a/source/config/win/x64/vpx_config.asm +++ b/source/config/win/x64/vpx_config.asm @@ -79,6 +79,7 @@ %define CONFIG_MULTI_RES_ENCODING 1 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +%define 
CONFIG_VP9_HIGHBITDEPTH 0 %define CONFIG_EXPERIMENTAL 0 %define CONFIG_SIZE_LIMIT 0 %define CONFIG_SPATIAL_SVC 0 diff --git a/source/config/win/x64/vpx_config.h b/source/config/win/x64/vpx_config.h index 9747cec..75777f0 100644 --- a/source/config/win/x64/vpx_config.h +++ b/source/config/win/x64/vpx_config.h @@ -91,6 +91,7 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 0 #define CONFIG_SPATIAL_SVC 0 diff --git a/source/libvpx/build/make/gen_msvs_proj.sh b/source/libvpx/build/make/gen_msvs_proj.sh index 3653309..7907225 100755 --- a/source/libvpx/build/make/gen_msvs_proj.sh +++ b/source/libvpx/build/make/gen_msvs_proj.sh @@ -245,13 +245,13 @@ esac case "$target" in x86_64*) platforms[0]="x64" - asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} "\$(InputPath)"" - asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} "\$(InputPath)"" + asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} "\$(InputPath)"" + asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} "\$(InputPath)"" ;; x86*) platforms[0]="Win32" - asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} "\$(InputPath)"" - asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} "\$(InputPath)"" + asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} "\$(InputPath)"" + asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} "\$(InputPath)"" ;; *) die "Unsupported target $target!" 
;; diff --git a/source/libvpx/build/make/gen_msvs_vcxproj.sh b/source/libvpx/build/make/gen_msvs_vcxproj.sh index 23ef6a3..56b9a3b 100755 --- a/source/libvpx/build/make/gen_msvs_vcxproj.sh +++ b/source/libvpx/build/make/gen_msvs_vcxproj.sh @@ -253,13 +253,13 @@ libs=${libs// /;} case "$target" in x86_64*) platforms[0]="x64" - asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} "%(FullPath)"" - asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} "%(FullPath)"" + asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} "%(FullPath)"" + asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} "%(FullPath)"" ;; x86*) platforms[0]="Win32" - asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} "%(FullPath)"" - asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} "%(FullPath)"" + asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} "%(FullPath)"" + asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} "%(FullPath)"" ;; arm*) asm_Debug_cmdline="armasm -nologo "%(FullPath)"" diff --git a/source/libvpx/configure b/source/libvpx/configure index 2708b45..32b70f1 100755 --- a/source/libvpx/configure +++ b/source/libvpx/configure @@ -334,6 +334,7 @@ CONFIG_LIST=" multi_res_encoding temporal_denoising coefficient_range_checking + vp9_highbitdepth experimental size_limit ${EXPERIMENT_LIST} @@ -392,6 +393,7 @@ CMDLINE_SELECT=" multi_res_encoding temporal_denoising coefficient_range_checking + vp9_highbitdepth experimental " diff --git a/source/libvpx/examples.mk b/source/libvpx/examples.mk index 91bd45a..bd38c41 100644 --- a/source/libvpx/examples.mk +++ b/source/libvpx/examples.mk @@ -31,6 +31,7 @@ LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \ third_party/libyuv/source/scale_common.cc \ third_party/libyuv/source/scale_mips.cc \ third_party/libyuv/source/scale_neon.cc \ + third_party/libyuv/source/scale_neon64.cc \ third_party/libyuv/source/scale_posix.cc \ third_party/libyuv/source/scale_win.cc \ @@ -185,7 +186,9 
@@ vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame ifeq ($(CONFIG_MULTI_RES_ENCODING),yes) ifeq ($(CONFIG_LIBYUV),yes) EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_multi_resolution_encoder.c +vp8_multi_resolution_encoder.SRCS += ivfenc.h ivfenc.c vp8_multi_resolution_encoder.SRCS += tools_common.h tools_common.c +vp8_multi_resolution_encoder.SRCS += video_writer.h video_writer.c vp8_multi_resolution_encoder.SRCS += $(LIBYUV_SRCS) vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding diff --git a/source/libvpx/examples/set_maps.c b/source/libvpx/examples/set_maps.c index ff60d51..2ee5bca 100644 --- a/source/libvpx/examples/set_maps.c +++ b/source/libvpx/examples/set_maps.c @@ -42,6 +42,7 @@ // Use the `simple_decoder` example to decode this sample, and observe // the change in the image at frames 22, 33, and 44. +#include <assert.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -177,9 +178,10 @@ int main(int argc, char **argv) { memset(&info, 0, sizeof(info)); encoder = get_vpx_encoder_by_name(argv[1]); - if (!encoder) + if (encoder == NULL) { die("Unsupported codec."); - + } + assert(encoder != NULL); info.codec_fourcc = encoder->fourcc; info.frame_width = strtol(argv[2], NULL, 0); info.frame_height = strtol(argv[3], NULL, 0); diff --git a/source/libvpx/examples/twopass_encoder.c b/source/libvpx/examples/twopass_encoder.c index 369b1d8..76d5a28 100644 --- a/source/libvpx/examples/twopass_encoder.c +++ b/source/libvpx/examples/twopass_encoder.c @@ -66,13 +66,14 @@ void usage_exit() { exit(EXIT_FAILURE); } -static void get_frame_stats(vpx_codec_ctx_t *ctx, - const vpx_image_t *img, - vpx_codec_pts_t pts, - unsigned int duration, - vpx_enc_frame_flags_t flags, - unsigned int deadline, - vpx_fixed_buf_t *stats) { +static int get_frame_stats(vpx_codec_ctx_t *ctx, + const vpx_image_t *img, + vpx_codec_pts_t pts, + unsigned int duration, + 
vpx_enc_frame_flags_t flags, + unsigned int deadline, + vpx_fixed_buf_t *stats) { + int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags, @@ -81,6 +82,8 @@ static void get_frame_stats(vpx_codec_ctx_t *ctx, die_codec(ctx, "Failed to get frame stats."); while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) { + got_pkts = 1; + if (pkt->kind == VPX_CODEC_STATS_PKT) { const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf; const size_t pkt_size = pkt->data.twopass_stats.sz; @@ -89,15 +92,18 @@ static void get_frame_stats(vpx_codec_ctx_t *ctx, stats->sz += pkt_size; } } + + return got_pkts; } -static void encode_frame(vpx_codec_ctx_t *ctx, - const vpx_image_t *img, - vpx_codec_pts_t pts, - unsigned int duration, - vpx_enc_frame_flags_t flags, - unsigned int deadline, - VpxVideoWriter *writer) { +static int encode_frame(vpx_codec_ctx_t *ctx, + const vpx_image_t *img, + vpx_codec_pts_t pts, + unsigned int duration, + vpx_enc_frame_flags_t flags, + unsigned int deadline, + VpxVideoWriter *writer) { + int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags, @@ -106,6 +112,7 @@ static void encode_frame(vpx_codec_ctx_t *ctx, die_codec(ctx, "Failed to encode frame."); while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) { + got_pkts = 1; if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; @@ -117,19 +124,90 @@ static void encode_frame(vpx_codec_ctx_t *ctx, fflush(stdout); } } + + return got_pkts; +} + +static vpx_fixed_buf_t pass0(vpx_image_t *raw, + FILE *infile, + const VpxInterface *encoder, + const vpx_codec_enc_cfg_t *cfg) { + vpx_codec_ctx_t codec; + int frame_count = 0; + vpx_fixed_buf_t stats = {NULL, 0}; + + if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0)) + 
die_codec(&codec, "Failed to initialize encoder"); + + // Calculate frame statistics. + while (vpx_img_read(raw, infile)) { + ++frame_count; + get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY, + &stats); + } + + // Flush encoder. + while (get_frame_stats(&codec, NULL, frame_count, 1, 0, + VPX_DL_BEST_QUALITY, &stats)) {} + + printf("Pass 0 complete. Processed %d frames.\n", frame_count); + if (vpx_codec_destroy(&codec)) + die_codec(&codec, "Failed to destroy codec."); + + return stats; +} + +static void pass1(vpx_image_t *raw, + FILE *infile, + const char *outfile_name, + const VpxInterface *encoder, + const vpx_codec_enc_cfg_t *cfg) { + VpxVideoInfo info = { + encoder->fourcc, + cfg->g_w, + cfg->g_h, + {cfg->g_timebase.num, cfg->g_timebase.den} + }; + VpxVideoWriter *writer = NULL; + vpx_codec_ctx_t codec; + int frame_count = 0; + + writer = vpx_video_writer_open(outfile_name, kContainerIVF, &info); + if (!writer) + die("Failed to open %s for writing", outfile_name); + + if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0)) + die_codec(&codec, "Failed to initialize encoder"); + + // Encode frames. + while (vpx_img_read(raw, infile)) { + ++frame_count; + encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY, writer); + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, 1, 0, VPX_DL_BEST_QUALITY, writer)) {} + + printf("\n"); + + if (vpx_codec_destroy(&codec)) + die_codec(&codec, "Failed to destroy codec."); + + vpx_video_writer_close(writer); + + printf("Pass 1 complete. 
Processed %d frames.\n", frame_count); } int main(int argc, char **argv) { FILE *infile = NULL; - VpxVideoWriter *writer = NULL; + int w, h; vpx_codec_ctx_t codec; vpx_codec_enc_cfg_t cfg; vpx_image_t raw; vpx_codec_err_t res; - vpx_fixed_buf_t stats = {0}; - VpxVideoInfo info = {0}; + vpx_fixed_buf_t stats; + const VpxInterface *encoder = NULL; - int pass; const int fps = 30; // TODO(dkovalev) add command line argument const int bitrate = 200; // kbit/s TODO(dkovalev) add command line argument const char *const codec_arg = argv[1]; @@ -146,85 +224,44 @@ int main(int argc, char **argv) { if (!encoder) die("Unsupported codec."); - info.codec_fourcc = encoder->fourcc; - info.time_base.numerator = 1; - info.time_base.denominator = fps; - info.frame_width = strtol(width_arg, NULL, 0); - info.frame_height = strtol(height_arg, NULL, 0); - - if (info.frame_width <= 0 || - info.frame_height <= 0 || - (info.frame_width % 2) != 0 || - (info.frame_height % 2) != 0) { - die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); - } + w = strtol(width_arg, NULL, 0); + h = strtol(height_arg, NULL, 0); - if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width, - info.frame_height, 1)) { - die("Failed to allocate image", info.frame_width, info.frame_height); - } + if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0) + die("Invalid frame size: %dx%d", w, h); - writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info); - if (!writer) - die("Failed to open %s for writing", outfile_arg); + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1)) + die("Failed to allocate image", w, h); printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); + // Configuration res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); if (res) die_codec(&codec, "Failed to get default codec config."); - cfg.g_w = info.frame_width; - cfg.g_h = info.frame_height; - cfg.g_timebase.num = info.time_base.numerator; - cfg.g_timebase.den = 
info.time_base.denominator; + cfg.g_w = w; + cfg.g_h = h; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = fps; cfg.rc_target_bitrate = bitrate; - for (pass = 0; pass < 2; ++pass) { - int frame_count = 0; - - if (pass == 0) { - cfg.g_pass = VPX_RC_FIRST_PASS; - } else { - cfg.g_pass = VPX_RC_LAST_PASS; - cfg.rc_twopass_stats_in = stats; - } - - if (!(infile = fopen(infile_arg, "rb"))) - die("Failed to open %s for reading", infile_arg); - - if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) - die_codec(&codec, "Failed to initialize encoder"); - - while (vpx_img_read(&raw, infile)) { - ++frame_count; + if (!(infile = fopen(infile_arg, "rb"))) + die("Failed to open %s for reading", infile_arg); - if (pass == 0) { - get_frame_stats(&codec, &raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY, - &stats); - } else { - encode_frame(&codec, &raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY, - writer); - } - } - - if (pass == 0) { - get_frame_stats(&codec, NULL, frame_count, 1, 0, VPX_DL_BEST_QUALITY, - &stats); - } else { - printf("\n"); - } + // Pass 0 + cfg.g_pass = VPX_RC_FIRST_PASS; + stats = pass0(&raw, infile, encoder, &cfg); - fclose(infile); - printf("Pass %d complete. Processed %d frames.\n", pass + 1, frame_count); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec."); - } - - vpx_img_free(&raw); + // Pass 1 + rewind(infile); + cfg.g_pass = VPX_RC_LAST_PASS; + cfg.rc_twopass_stats_in = stats; + pass1(&raw, infile, outfile_arg, encoder, &cfg); free(stats.buf); - vpx_video_writer_close(writer); + vpx_img_free(&raw); + fclose(infile); return EXIT_SUCCESS; } diff --git a/source/libvpx/examples/vp8_multi_resolution_encoder.c b/source/libvpx/examples/vp8_multi_resolution_encoder.c index d41e442..7c050fa 100644 --- a/source/libvpx/examples/vp8_multi_resolution_encoder.c +++ b/source/libvpx/examples/vp8_multi_resolution_encoder.c @@ -8,446 +8,293 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -/* - * This is an example demonstrating multi-resolution encoding in VP8. - * High-resolution input video is down-sampled to lower-resolutions. The - * encoder then encodes the video and outputs multiple bitstreams with - * different resolutions. - */ + +// This is an example demonstrating multi-resolution encoding in VP8. +// High-resolution input video is down-sampled to lower-resolutions. The +// encoder then encodes the video and outputs multiple bitstreams with +// different resolutions. +// +// Configure with --enable-multi-res-encoding flag to enable this example. + #include <stdio.h> #include <stdlib.h> -#include <stdarg.h> #include <string.h> -#include <math.h> -#define VPX_CODEC_DISABLE_COMPAT 1 -#include "vpx/vpx_encoder.h" -#include "vpx/vp8cx.h" -#include "vpx_ports/mem_ops.h" -#include "./tools_common.h" -#define interface (vpx_codec_vp8_cx()) -#define fourcc 0x30385056 - -void usage_exit() { - exit(EXIT_FAILURE); -} - -/* - * The input video frame is downsampled several times to generate a multi-level - * hierarchical structure. NUM_ENCODERS is defined as the number of encoding - * levels required. For example, if the size of input video is 1280x720, - * NUM_ENCODERS is 3, and down-sampling factor is 2, the encoder outputs 3 - * bitstreams with resolution of 1280x720(level 0), 640x360(level 1), and - * 320x180(level 2) respectively. - */ -#define NUM_ENCODERS 3 -/* This example uses the scaler function in libyuv. */ #include "third_party/libyuv/include/libyuv/basic_types.h" #include "third_party/libyuv/include/libyuv/scale.h" #include "third_party/libyuv/include/libyuv/cpu_id.h" -int (*read_frame_p)(FILE *f, vpx_image_t *img); - -static int read_frame(FILE *f, vpx_image_t *img) { - size_t nbytes, to_read; - int res = 1; - - to_read = img->w*img->h*3/2; - nbytes = fread(img->planes[0], 1, to_read, f); - if(nbytes != to_read) { - res = 0; - if(nbytes > 0) - printf("Warning: Read partial frame. 
Check your width & height!\n"); - } - return res; -} - -static int read_frame_by_row(FILE *f, vpx_image_t *img) { - size_t nbytes, to_read; - int res = 1; - int plane; - - for (plane = 0; plane < 3; plane++) - { - unsigned char *ptr; - int w = (plane ? (1 + img->d_w) / 2 : img->d_w); - int h = (plane ? (1 + img->d_h) / 2 : img->d_h); - int r; - - /* Determine the correct plane based on the image format. The for-loop - * always counts in Y,U,V order, but this may not match the order of - * the data on disk. - */ - switch (plane) - { - case 1: - ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12? VPX_PLANE_V : VPX_PLANE_U]; - break; - case 2: - ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12?VPX_PLANE_U : VPX_PLANE_V]; - break; - default: - ptr = img->planes[plane]; - } - - for (r = 0; r < h; r++) - { - to_read = w; - - nbytes = fread(ptr, 1, to_read, f); - if(nbytes != to_read) { - res = 0; - if(nbytes > 0) - printf("Warning: Read partial frame. Check your width & height!\n"); - break; - } - - ptr += img->stride[plane]; - } - if (!res) - break; - } - - return res; -} - -static void write_ivf_file_header(FILE *outfile, - const vpx_codec_enc_cfg_t *cfg, - int frame_cnt) { - char header[32]; - - if(cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS) - return; - header[0] = 'D'; - header[1] = 'K'; - header[2] = 'I'; - header[3] = 'F'; - mem_put_le16(header+4, 0); /* version */ - mem_put_le16(header+6, 32); /* headersize */ - mem_put_le32(header+8, fourcc); /* headersize */ - mem_put_le16(header+12, cfg->g_w); /* width */ - mem_put_le16(header+14, cfg->g_h); /* height */ - mem_put_le32(header+16, cfg->g_timebase.den); /* rate */ - mem_put_le32(header+20, cfg->g_timebase.num); /* scale */ - mem_put_le32(header+24, frame_cnt); /* length */ - mem_put_le32(header+28, 0); /* unused */ - - (void) fwrite(header, 1, 32, outfile); -} +#define VPX_CODEC_DISABLE_COMPAT 1 +#include "vpx/vpx_encoder.h" +#include "vpx/vp8cx.h" -static void write_ivf_frame_header(FILE *outfile, - 
const vpx_codec_cx_pkt_t *pkt) -{ - char header[12]; - vpx_codec_pts_t pts; +#include "./tools_common.h" +#include "./video_writer.h" - if(pkt->kind != VPX_CODEC_CX_FRAME_PKT) - return; +// The input video frame is downsampled several times to generate a +// multi-level hierarchical structure. kNumEncoders is defined as the number +// of encoding levels required. For example, if the size of input video is +// 1280x720, kNumEncoders is 3, and down-sampling factor is 2, the encoder +// outputs 3 bitstreams with resolution of 1280x720(level 0), +// 640x360(level 1), and 320x180(level 2) respectively. +#define kNumEncoders 3 - pts = pkt->data.frame.pts; - mem_put_le32(header, pkt->data.frame.sz); - mem_put_le32(header+4, pts&0xFFFFFFFF); - mem_put_le32(header+8, pts >> 32); +static const char *exec_name; - (void) fwrite(header, 1, 12, outfile); +void usage_exit() { + fprintf(stderr, + "Usage: %s <width> <height> <infile> <outfile(s)> <output psnr?>\n", + exec_name); + exit(EXIT_FAILURE); } -int main(int argc, char **argv) -{ - FILE *infile, *outfile[NUM_ENCODERS]; - vpx_codec_ctx_t codec[NUM_ENCODERS]; - vpx_codec_enc_cfg_t cfg[NUM_ENCODERS]; - vpx_codec_pts_t frame_cnt = 0; - vpx_image_t raw[NUM_ENCODERS]; - vpx_codec_err_t res[NUM_ENCODERS]; - - int i; - long width; - long height; - int frame_avail; - int got_data; - int flags = 0; - - /*Currently, only realtime mode is supported in multi-resolution encoding.*/ - int arg_deadline = VPX_DL_REALTIME; - - /* Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you - don't need to know PSNR, which will skip PSNR calculation and save - encoding time. */ - int show_psnr = 0; - uint64_t psnr_sse_total[NUM_ENCODERS] = {0}; - uint64_t psnr_samples_total[NUM_ENCODERS] = {0}; - double psnr_totals[NUM_ENCODERS][4] = {{0,0}}; - int psnr_count[NUM_ENCODERS] = {0}; - - /* Set the required target bitrates for each resolution level. - * If target bitrate for highest-resolution level is set to 0, - * (i.e. 
target_bitrate[0]=0), we skip encoding at that level. - */ - unsigned int target_bitrate[NUM_ENCODERS]={1000, 500, 100}; - /* Enter the frame rate of the input video */ - int framerate = 30; - /* Set down-sampling factor for each resolution level. - dsf[0] controls down sampling from level 0 to level 1; - dsf[1] controls down sampling from level 1 to level 2; - dsf[2] is not used. */ - vpx_rational_t dsf[NUM_ENCODERS] = {{2, 1}, {2, 1}, {1, 1}}; - - if(argc!= (5+NUM_ENCODERS)) - die("Usage: %s <width> <height> <infile> <outfile(s)> <output psnr?>\n", - argv[0]); - - printf("Using %s\n",vpx_codec_iface_name(interface)); - - width = strtol(argv[1], NULL, 0); - height = strtol(argv[2], NULL, 0); - - if(width < 16 || width%2 || height <16 || height%2) - die("Invalid resolution: %ldx%ld", width, height); - - /* Open input video file for encoding */ - if(!(infile = fopen(argv[3], "rb"))) - die("Failed to open %s for reading", argv[3]); - - /* Open output file for each encoder to output bitstreams */ - for (i=0; i< NUM_ENCODERS; i++) - { - if(!target_bitrate[i]) - { - outfile[i] = NULL; - continue; - } - - if(!(outfile[i] = fopen(argv[i+4], "wb"))) - die("Failed to open %s for writing", argv[i+4]); - } - - show_psnr = strtol(argv[NUM_ENCODERS + 4], NULL, 0); - - /* Populate default encoder configuration */ - for (i=0; i< NUM_ENCODERS; i++) - { - res[i] = vpx_codec_enc_config_default(interface, &cfg[i], 0); - if(res[i]) { - printf("Failed to get config: %s\n", vpx_codec_err_to_string(res[i])); - return EXIT_FAILURE; - } +int main(int argc, char *argv[]) { + int frame_cnt = 0; + FILE *infile = NULL; + VpxVideoWriter *writers[kNumEncoders]; + vpx_codec_ctx_t codec[kNumEncoders]; + vpx_codec_enc_cfg_t cfg[kNumEncoders]; + vpx_image_t raw[kNumEncoders]; + const VpxInterface *const encoder = get_vpx_encoder_by_name("vp8"); + // Currently, only realtime mode is supported in multi-resolution encoding. 
+ const int arg_deadline = VPX_DL_REALTIME; + int i; + int width = 0; + int height = 0; + int frame_avail = 0; + int got_data = 0; + + // Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you + // don't need to know PSNR, which will skip PSNR calculation and save + // encoding time. + int show_psnr = 0; + uint64_t psnr_sse_total[kNumEncoders] = {0}; + uint64_t psnr_samples_total[kNumEncoders] = {0}; + double psnr_totals[kNumEncoders][4] = {{0, 0}}; + int psnr_count[kNumEncoders] = {0}; + + // Set the required target bitrates for each resolution level. + // If target bitrate for highest-resolution level is set to 0, + // (i.e. target_bitrate[0]=0), we skip encoding at that level. + unsigned int target_bitrate[kNumEncoders] = {1000, 500, 100}; + + // Enter the frame rate of the input video. + const int framerate = 30; + // Set down-sampling factor for each resolution level. + // dsf[0] controls down sampling from level 0 to level 1; + // dsf[1] controls down sampling from level 1 to level 2; + // dsf[2] is not used. 
+ vpx_rational_t dsf[kNumEncoders] = {{2, 1}, {2, 1}, {1, 1}}; + + exec_name = argv[0]; + + if (!encoder) + die("Unsupported codec."); + + // exe_name, input width, input height, input file, + // output file 1, output file 2, output file 3, psnr on/off + if (argc != (5 + kNumEncoders)) + die("Invalid number of input options."); + + printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); + + width = strtol(argv[1], NULL, 0); + height = strtol(argv[2], NULL, 0); + + if (width < 16 || width % 2 || height < 16 || height % 2) + die("Invalid resolution: %ldx%ld", width, height); + + // Open input video file for encoding + if (!(infile = fopen(argv[3], "rb"))) + die("Failed to open %s for reading", argv[3]); + + show_psnr = strtol(argv[kNumEncoders + 4], NULL, 0); + + // Populate default encoder configuration + for (i = 0; i < kNumEncoders; ++i) { + vpx_codec_err_t res = + vpx_codec_enc_config_default(encoder->codec_interface(), &cfg[i], 0); + if (res != VPX_CODEC_OK) { + printf("Failed to get config: %s\n", vpx_codec_err_to_string(res)); + return EXIT_FAILURE; } - - /* - * Update the default configuration according to needs of the application. - */ - /* Highest-resolution encoder settings */ - cfg[0].g_w = width; - cfg[0].g_h = height; - cfg[0].g_threads = 1; /* number of threads used */ - cfg[0].rc_dropframe_thresh = 30; - cfg[0].rc_end_usage = VPX_CBR; - cfg[0].rc_resize_allowed = 0; - cfg[0].rc_min_quantizer = 4; - cfg[0].rc_max_quantizer = 56; - cfg[0].rc_undershoot_pct = 98; - cfg[0].rc_overshoot_pct = 100; - cfg[0].rc_buf_initial_sz = 500; - cfg[0].rc_buf_optimal_sz = 600; - cfg[0].rc_buf_sz = 1000; - cfg[0].g_error_resilient = 1; /* Enable error resilient mode */ - cfg[0].g_lag_in_frames = 0; - - /* Disable automatic keyframe placement */ - /* Note: These 3 settings are copied to all levels. But, except the lowest - * resolution level, all other levels are set to VPX_KF_DISABLED internally. 
- */ - //cfg[0].kf_mode = VPX_KF_DISABLED; - cfg[0].kf_mode = VPX_KF_AUTO; - cfg[0].kf_min_dist = 3000; - cfg[0].kf_max_dist = 3000; - - cfg[0].rc_target_bitrate = target_bitrate[0]; /* Set target bitrate */ - cfg[0].g_timebase.num = 1; /* Set fps */ - cfg[0].g_timebase.den = framerate; - - /* Other-resolution encoder settings */ - for (i=1; i< NUM_ENCODERS; i++) + } + + // Update the default configuration according to needs of the application. + // Highest-resolution encoder settings + cfg[0].g_w = width; + cfg[0].g_h = height; + cfg[0].g_threads = 1; + cfg[0].rc_dropframe_thresh = 30; + cfg[0].rc_end_usage = VPX_CBR; + cfg[0].rc_resize_allowed = 0; + cfg[0].rc_min_quantizer = 4; + cfg[0].rc_max_quantizer = 56; + cfg[0].rc_undershoot_pct = 98; + cfg[0].rc_overshoot_pct = 100; + cfg[0].rc_buf_initial_sz = 500; + cfg[0].rc_buf_optimal_sz = 600; + cfg[0].rc_buf_sz = 1000; + cfg[0].g_error_resilient = 1; + cfg[0].g_lag_in_frames = 0; + cfg[0].kf_mode = VPX_KF_AUTO; // VPX_KF_DISABLED + cfg[0].kf_min_dist = 3000; + cfg[0].kf_max_dist = 3000; + cfg[0].rc_target_bitrate = target_bitrate[0]; + cfg[0].g_timebase.num = 1; + cfg[0].g_timebase.den = framerate; + + // Other-resolution encoder settings + for (i = 1; i < kNumEncoders; ++i) { + cfg[i] = cfg[0]; + cfg[i].g_threads = 1; + cfg[i].rc_target_bitrate = target_bitrate[i]; + + // Note: Width & height of other-resolution encoders are calculated + // from the highest-resolution encoder's size and the corresponding + // down_sampling_factor. { - memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t)); - - cfg[i].g_threads = 1; /* number of threads used */ - cfg[i].rc_target_bitrate = target_bitrate[i]; - - /* Note: Width & height of other-resolution encoders are calculated - * from the highest-resolution encoder's size and the corresponding - * down_sampling_factor. 
- */ - { - unsigned int iw = cfg[i-1].g_w*dsf[i-1].den + dsf[i-1].num - 1; - unsigned int ih = cfg[i-1].g_h*dsf[i-1].den + dsf[i-1].num - 1; - cfg[i].g_w = iw/dsf[i-1].num; - cfg[i].g_h = ih/dsf[i-1].num; - } - - /* Make width & height to be multiplier of 2. */ - // Should support odd size ??? - if((cfg[i].g_w)%2)cfg[i].g_w++; - if((cfg[i].g_h)%2)cfg[i].g_h++; + unsigned int iw = cfg[i - 1].g_w * dsf[i - 1].den + dsf[i - 1].num - 1; + unsigned int ih = cfg[i - 1].g_h * dsf[i - 1].den + dsf[i - 1].num - 1; + cfg[i].g_w = iw / dsf[i - 1].num; + cfg[i].g_h = ih / dsf[i - 1].num; } - /* Allocate image for each encoder */ - for (i=0; i< NUM_ENCODERS; i++) - if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32)) - die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); - - if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w) - read_frame_p = read_frame; - else - read_frame_p = read_frame_by_row; - - for (i=0; i< NUM_ENCODERS; i++) - if(outfile[i]) - write_ivf_file_header(outfile[i], &cfg[i], 0); - - /* Initialize multi-encoder */ - if(vpx_codec_enc_init_multi(&codec[0], interface, &cfg[0], NUM_ENCODERS, - (show_psnr ? VPX_CODEC_USE_PSNR : 0), &dsf[0])) - die_codec(&codec[0], "Failed to initialize encoder"); - - /* The extra encoding configuration parameters can be set as follows. */ - /* Set encoding speed */ - for ( i=0; i<NUM_ENCODERS; i++) - { - int speed = -6; - if(vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed)) - die_codec(&codec[i], "Failed to set cpu_used"); + // Make width & height to be multiplier of 2. 
+ if ((cfg[i].g_w) % 2) + cfg[i].g_w++; + + if ((cfg[i].g_h) % 2) + cfg[i].g_h++; + } + + // Open output file for each encoder to output bitstreams + for (i = 0; i < kNumEncoders; ++i) { + VpxVideoInfo info = { + encoder->fourcc, + cfg[i].g_w, + cfg[i].g_h, + {cfg[i].g_timebase.num, cfg[i].g_timebase.den} + }; + + if (!(writers[i] = vpx_video_writer_open(argv[i+4], kContainerIVF, &info))) + die("Failed to open %s for writing", argv[i+4]); + } + + // Allocate image for each encoder + for (i = 0; i < kNumEncoders; ++i) + if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32)) + die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); + + // Initialize multi-encoder + if (vpx_codec_enc_init_multi(&codec[0], encoder->codec_interface(), &cfg[0], + kNumEncoders, + show_psnr ? VPX_CODEC_USE_PSNR : 0, &dsf[0])) + die_codec(&codec[0], "Failed to initialize encoder"); + + // The extra encoding configuration parameters can be set as follows. + for (i = 0; i < kNumEncoders; i++) { + // Set encoding speed + if (vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, -6)) + die_codec(&codec[i], "Failed to set cpu_used"); + + // Set static threshold. + if (vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, 1)) + die_codec(&codec[i], "Failed to set static threshold"); + + // Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING + // Enable denoising for the highest-resolution encoder. + if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, i == 0)) + die_codec(&codec[0], "Failed to set noise_sensitivity"); + } + + frame_avail = 1; + got_data = 0; + + while (frame_avail || got_data) { + vpx_codec_iter_t iter[kNumEncoders] = {NULL}; + const vpx_codec_cx_pkt_t *pkt[kNumEncoders]; + + frame_avail = vpx_img_read(&raw[0], infile); + + if (frame_avail) { + for (i = 1; i < kNumEncoders; ++i) { + vpx_image_t *const prev = &raw[i - 1]; + + // Scale the image down a number of times by downsampling factor + // FilterMode 1 or 2 give better psnr than FilterMode 0. 
+ I420Scale(prev->planes[VPX_PLANE_Y], prev->stride[VPX_PLANE_Y], + prev->planes[VPX_PLANE_U], prev->stride[VPX_PLANE_U], + prev->planes[VPX_PLANE_V], prev->stride[VPX_PLANE_V], + prev->d_w, prev->d_h, + raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y], + raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U], + raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V], + raw[i].d_w, raw[i].d_h, 1); + } } - /* Set static threshold. */ - for ( i=0; i<NUM_ENCODERS; i++) - { - unsigned int static_thresh = 1; - if(vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, static_thresh)) - die_codec(&codec[i], "Failed to set static threshold"); + // Encode frame. + if (vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL, + frame_cnt, 1, 0, arg_deadline)) { + die_codec(&codec[0], "Failed to encode frame"); } - /* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING */ - /* Enable denoising for the highest-resolution encoder. */ - if(vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 1)) - die_codec(&codec[0], "Failed to set noise_sensitivity"); - for ( i=1; i< NUM_ENCODERS; i++) - { - if(vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0)) - die_codec(&codec[i], "Failed to set noise_sensitivity"); - } - - - frame_avail = 1; - got_data = 0; - - while(frame_avail || got_data) - { - vpx_codec_iter_t iter[NUM_ENCODERS]={NULL}; - const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS]; - - flags = 0; - frame_avail = read_frame_p(infile, &raw[0]); - - if(frame_avail) - { - for ( i=1; i<NUM_ENCODERS; i++) - { - /*Scale the image down a number of times by downsampling factor*/ - /* FilterMode 1 or 2 give better psnr than FilterMode 0. 
*/ - I420Scale(raw[i-1].planes[VPX_PLANE_Y], raw[i-1].stride[VPX_PLANE_Y], - raw[i-1].planes[VPX_PLANE_U], raw[i-1].stride[VPX_PLANE_U], - raw[i-1].planes[VPX_PLANE_V], raw[i-1].stride[VPX_PLANE_V], - raw[i-1].d_w, raw[i-1].d_h, - raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y], - raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U], - raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V], - raw[i].d_w, raw[i].d_h, 1); - } - } - - /* Encode each frame at multi-levels */ - if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL, - frame_cnt, 1, flags, arg_deadline)) - die_codec(&codec[0], "Failed to encode frame"); - - for (i=NUM_ENCODERS-1; i>=0 ; i--) - { - got_data = 0; - - while( (pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i])) ) - { - got_data = 1; - switch(pkt[i]->kind) { - case VPX_CODEC_CX_FRAME_PKT: - write_ivf_frame_header(outfile[i], pkt[i]); - (void) fwrite(pkt[i]->data.frame.buf, 1, - pkt[i]->data.frame.sz, outfile[i]); - break; - case VPX_CODEC_PSNR_PKT: - if (show_psnr) - { - int j; - - psnr_sse_total[i] += pkt[i]->data.psnr.sse[0]; - psnr_samples_total[i] += pkt[i]->data.psnr.samples[0]; - for (j = 0; j < 4; j++) - { - //fprintf(stderr, "%.3lf ", pkt[i]->data.psnr.psnr[j]); - psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j]; - } - psnr_count[i]++; - } - - break; - default: - break; - } - printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT - && (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? 
"K":"."); - fflush(stdout); + for (i = kNumEncoders - 1; i >= 0; i--) { + got_data = 0; + + while ((pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i]))) { + got_data = 1; + switch (pkt[i]->kind) { + case VPX_CODEC_CX_FRAME_PKT: + vpx_video_writer_write_frame(writers[i], pkt[i]->data.frame.buf, + pkt[i]->data.frame.sz, frame_cnt - 1); + break; + case VPX_CODEC_PSNR_PKT: + if (show_psnr) { + int j; + psnr_sse_total[i] += pkt[i]->data.psnr.sse[0]; + psnr_samples_total[i] += pkt[i]->data.psnr.samples[0]; + for (j = 0; j < 4; j++) + psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j]; + psnr_count[i]++; } + break; + default: + break; } - frame_cnt++; + printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT && + (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? "K":"."); + fflush(stdout); + } + } + frame_cnt++; + } + printf("\n"); + + fclose(infile); + + printf("Processed %d frames.\n", frame_cnt - 1); + for (i = 0; i < kNumEncoders; ++i) { + // Calculate PSNR and print it out + if (show_psnr && psnr_count[i] > 0) { + int j; + double ovpsnr = sse_to_psnr(psnr_samples_total[i], 255.0, + psnr_sse_total[i]); + + fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i); + fprintf(stderr, " %.3lf", ovpsnr); + for (j = 0; j < 4; j++) + fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]); } - printf("\n"); - - fclose(infile); - - printf("Processed %ld frames.\n",(long int)frame_cnt-1); - for (i=0; i< NUM_ENCODERS; i++) - { - /* Calculate PSNR and print it out */ - if ( (show_psnr) && (psnr_count[i]>0) ) - { - int j; - double ovpsnr = sse_to_psnr(psnr_samples_total[i], 255.0, - psnr_sse_total[i]); - - fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i); - - fprintf(stderr, " %.3lf", ovpsnr); - for (j = 0; j < 4; j++) - { - fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]); - } - } - - if(vpx_codec_destroy(&codec[i])) - die_codec(&codec[i], "Failed to destroy codec"); - - vpx_img_free(&raw[i]); - if(!outfile[i]) - continue; + if (vpx_codec_destroy(&codec[i])) + 
die_codec(&codec[i], "Failed to destroy codec"); - /* Try to rewrite the file header with the actual frame count */ - if(!fseek(outfile[i], 0, SEEK_SET)) - write_ivf_file_header(outfile[i], &cfg[i], frame_cnt-1); - fclose(outfile[i]); - } - printf("\n"); + vpx_img_free(&raw[i]); + vpx_video_writer_close(writers[i]); + } + printf("\n"); - return EXIT_SUCCESS; + return EXIT_SUCCESS; } diff --git a/source/libvpx/examples/vp9_spatial_svc_encoder.c b/source/libvpx/examples/vp9_spatial_svc_encoder.c index 223f37e..81d3800 100644 --- a/source/libvpx/examples/vp9_spatial_svc_encoder.c +++ b/source/libvpx/examples/vp9_spatial_svc_encoder.c @@ -38,8 +38,10 @@ static const arg_def_t timebase_arg = ARG_DEF("t", "timebase", 1, "timebase (num/den)"); static const arg_def_t bitrate_arg = ARG_DEF( "b", "target-bitrate", 1, "encoding bitrate, in kilobits per second"); -static const arg_def_t layers_arg = - ARG_DEF("l", "layers", 1, "number of SVC layers"); +static const arg_def_t spatial_layers_arg = + ARG_DEF("sl", "spatial-layers", 1, "number of spatial SVC layers"); +static const arg_def_t temporal_layers_arg = + ARG_DEF("tl", "temporal-layers", 1, "number of temporal SVC layers"); static const arg_def_t kf_dist_arg = ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes"); static const arg_def_t scale_factors_arg = @@ -65,10 +67,11 @@ static const arg_def_t max_bitrate_arg = static const arg_def_t *svc_args[] = { &frames_arg, &width_arg, &height_arg, - &timebase_arg, &bitrate_arg, &skip_frames_arg, &layers_arg, + &timebase_arg, &bitrate_arg, &skip_frames_arg, &spatial_layers_arg, &kf_dist_arg, &scale_factors_arg, &quantizers_arg, &passes_arg, &pass_arg, &fpf_name_arg, &min_q_arg, &max_q_arg, - &min_bitrate_arg, &max_bitrate_arg, NULL + &min_bitrate_arg, &max_bitrate_arg, &temporal_layers_arg, + NULL }; static const uint32_t default_frames_to_skip = 0; @@ -79,6 +82,7 @@ static const uint32_t default_timebase_num = 1; static const uint32_t default_timebase_den = 60; 
static const uint32_t default_bitrate = 1000; static const uint32_t default_spatial_layers = 5; +static const uint32_t default_temporal_layers = 1; static const uint32_t default_kf_dist = 100; typedef struct { @@ -119,6 +123,7 @@ static void parse_command_line(int argc, const char **argv_, // initialize SvcContext with parameters that will be passed to vpx_svc_init svc_ctx->log_level = SVC_LOG_DEBUG; svc_ctx->spatial_layers = default_spatial_layers; + svc_ctx->temporal_layers = default_temporal_layers; // start with default encoder configuration res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0); @@ -156,8 +161,10 @@ static void parse_command_line(int argc, const char **argv_, enc_cfg->rc_target_bitrate = arg_parse_uint(&arg); } else if (arg_match(&arg, &skip_frames_arg, argi)) { app_input->frames_to_skip = arg_parse_uint(&arg); - } else if (arg_match(&arg, &layers_arg, argi)) { + } else if (arg_match(&arg, &spatial_layers_arg, argi)) { svc_ctx->spatial_layers = arg_parse_uint(&arg); + } else if (arg_match(&arg, &temporal_layers_arg, argi)) { + svc_ctx->temporal_layers = arg_parse_uint(&arg); } else if (arg_match(&arg, &kf_dist_arg, argi)) { enc_cfg->kf_min_dist = arg_parse_uint(&arg); enc_cfg->kf_max_dist = enc_cfg->kf_min_dist; diff --git a/source/libvpx/examples/vpx_temporal_svc_encoder.c b/source/libvpx/examples/vpx_temporal_svc_encoder.c index 4ec1848..5eac92c 100644 --- a/source/libvpx/examples/vpx_temporal_svc_encoder.c +++ b/source/libvpx/examples/vpx_temporal_svc_encoder.c @@ -12,6 +12,7 @@ // encoding scheme based on temporal scalability for video applications // that benefit from a scalable bitstream. 
+#include <assert.h> #include <math.h> #include <stdio.h> #include <stdlib.h> @@ -438,7 +439,7 @@ static void set_temporal_layer_pattern(int layering_mode, } int main(int argc, char **argv) { - VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS]; + VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = {NULL}; vpx_codec_ctx_t codec; vpx_codec_enc_cfg_t cfg; int frame_cnt = 0; @@ -456,7 +457,6 @@ int main(int argc, char **argv) { int layering_mode = 0; int layer_flags[VPX_TS_MAX_PERIODICITY] = {0}; int flag_periodicity = 1; - int max_intra_size_pct; vpx_svc_layer_id_t layer_id = {0, 0}; const VpxInterface *encoder = NULL; FILE *infile = NULL; @@ -570,6 +570,8 @@ int main(int argc, char **argv) { outfile[i] = vpx_video_writer_open(file_name, kContainerIVF, &info); if (!outfile[i]) die("Failed to open %s for writing", file_name); + + assert(outfile[i] != NULL); } // No spatial layers in this encoder. cfg.ss_number_layers = 1; @@ -595,11 +597,11 @@ int main(int argc, char **argv) { // This controls the maximum target size of the key frame. // For generating smaller key frames, use a smaller max_intra_size_pct // value, like 100 or 200. - max_intra_size_pct = (int) (((double)cfg.rc_buf_optimal_sz * 0.5) - * ((double) cfg.g_timebase.den / cfg.g_timebase.num) / 10.0); - // For low-quality key frame. 
- max_intra_size_pct = 200; - vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct); + { + const int max_intra_size_pct = 200; + vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, + max_intra_size_pct); + } frame_avail = 1; while (frame_avail || got_data) { diff --git a/source/libvpx/libs.mk b/source/libvpx/libs.mk index 25fbc2c..c7c2748 100644 --- a/source/libvpx/libs.mk +++ b/source/libvpx/libs.mk @@ -133,6 +133,8 @@ ifeq ($(CONFIG_VP9_DECODER),yes) CODEC_DOC_SECTIONS += vp9 vp9_decoder endif +VP9_PREFIX=vp9/ +$(BUILD_PFX)$(VP9_PREFIX)%.c.o: CFLAGS += -Wextra ifeq ($(CONFIG_ENCODERS),yes) CODEC_DOC_SECTIONS += encoder diff --git a/source/libvpx/test/active_map_test.cc b/source/libvpx/test/active_map_test.cc index a9bb540..0221995 100644 --- a/source/libvpx/test/active_map_test.cc +++ b/source/libvpx/test/active_map_test.cc @@ -38,7 +38,7 @@ class ActiveMapTest if (video->frame() == 1) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); } else if (video->frame() == 3) { - vpx_active_map_t map = {0}; + vpx_active_map_t map = vpx_active_map_t(); uint8_t active_map[9 * 13] = { 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, @@ -57,7 +57,7 @@ class ActiveMapTest map.active_map = active_map; encoder->Control(VP8E_SET_ACTIVEMAP, &map); } else if (video->frame() == 15) { - vpx_active_map_t map = {0}; + vpx_active_map_t map = vpx_active_map_t(); map.cols = (kWidth + 15) / 16; map.rows = (kHeight + 15) / 16; map.active_map = NULL; diff --git a/source/libvpx/test/datarate_test.cc b/source/libvpx/test/datarate_test.cc index 8dcf26c..a3d730a 100644 --- a/source/libvpx/test/datarate_test.cc +++ b/source/libvpx/test/datarate_test.cc @@ -42,6 +42,9 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest, virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { + if (video->frame() == 1) { + encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); + } const 
vpx_rational_t tb = video->timebase(); timebase_ = static_cast<double>(tb.num) / tb.den; duration_ = 0; @@ -120,9 +123,40 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest, double file_datarate_; double effective_datarate_; size_t bits_in_last_frame_; + int denoiser_on_; }; +// Check basic datarate targeting, for a single bitrate, but loop over the +// various denoiser settings. +TEST_P(DatarateTestLarge, DenoiserLevels) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 140); + for (int j = 1; j < 5; ++j) { + // Run over the denoiser levels. + // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j + // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV, + // denoiserOnAggressive, and denoiserOnAdaptive. + // For the spatial denoiser (if !CONFIG_TEMPORAL_DENOISING), the level j + // refers to the blur thresholds: 20, 40, 60 80. + // The j = 0 case (denoiser off) is covered in the tests below. 
+ denoiser_on_ = j; + cfg_.rc_target_bitrate = 300; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3) + << " The datarate for the file missed the target!"; + } +} + TEST_P(DatarateTestLarge, BasicBufferModel) { + denoiser_on_ = 0; cfg_.rc_buf_initial_sz = 500; cfg_.rc_dropframe_thresh = 1; cfg_.rc_max_quantizer = 56; @@ -154,6 +188,7 @@ TEST_P(DatarateTestLarge, BasicBufferModel) { } TEST_P(DatarateTestLarge, ChangingDropFrameThresh) { + denoiser_on_ = 0; cfg_.rc_buf_initial_sz = 500; cfg_.rc_max_quantizer = 36; cfg_.rc_end_usage = VPX_CBR; diff --git a/source/libvpx/test/dct16x16_test.cc b/source/libvpx/test/dct16x16_test.cc index ee417ce..c38cc2e 100644 --- a/source/libvpx/test/dct16x16_test.cc +++ b/source/libvpx/test/dct16x16_test.cc @@ -268,11 +268,13 @@ typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride, typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct16x16Param; typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht16x16Param; -void fdct16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) { +void fdct16x16_ref(const int16_t *in, int16_t *out, int stride, + int /*tx_type*/) { vp9_fdct16x16_c(in, out, stride); } -void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) { +void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride, + int /*tx_type*/) { vp9_idct16x16_256_add_c(in, dest, stride); } diff --git a/source/libvpx/test/dct32x32_test.cc b/source/libvpx/test/dct32x32_test.cc index 4f34a44..d2d437c 100644 --- a/source/libvpx/test/dct32x32_test.cc +++ b/source/libvpx/test/dct32x32_test.cc @@ -37,7 +37,7 @@ static int round(double x) { const int kNumCoeffs = 1024; const double kPi = 3.141592653589793238462643383279502884; -void reference_32x32_dct_1d(const double in[32], double out[32], int stride) { +void 
reference_32x32_dct_1d(const double in[32], double out[32]) { const double kInvSqrt2 = 0.707106781186547524400844362104; for (int k = 0; k < 32; k++) { out[k] = 0.0; @@ -55,7 +55,7 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs], double temp_in[32], temp_out[32]; for (int j = 0; j < 32; ++j) temp_in[j] = input[j*32 + i]; - reference_32x32_dct_1d(temp_in, temp_out, 1); + reference_32x32_dct_1d(temp_in, temp_out); for (int j = 0; j < 32; ++j) output[j * 32 + i] = temp_out[j]; } @@ -64,7 +64,7 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs], double temp_in[32], temp_out[32]; for (int j = 0; j < 32; ++j) temp_in[j] = output[j + i*32]; - reference_32x32_dct_1d(temp_in, temp_out, 1); + reference_32x32_dct_1d(temp_in, temp_out); // Scale by some magic number for (int j = 0; j < 32; ++j) output[j + i * 32] = temp_out[j] / 4; diff --git a/source/libvpx/test/decode_perf_test.cc b/source/libvpx/test/decode_perf_test.cc index 11529b3..5a71140 100644 --- a/source/libvpx/test/decode_perf_test.cc +++ b/source/libvpx/test/decode_perf_test.cc @@ -74,7 +74,7 @@ TEST_P(DecodePerfTest, PerfTest) { libvpx_test::WebMVideoSource video(video_name); video.Init(); - vpx_codec_dec_cfg_t cfg = {0}; + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); cfg.threads = threads; libvpx_test::VP9Decoder decoder(cfg, 0); diff --git a/source/libvpx/test/decode_test_driver.cc b/source/libvpx/test/decode_test_driver.cc index 99610eb..0ef4f7b 100644 --- a/source/libvpx/test/decode_test_driver.cc +++ b/source/libvpx/test/decode_test_driver.cc @@ -106,7 +106,7 @@ void DecoderTest::RunLoop(CompressedVideoSource *video, } void DecoderTest::RunLoop(CompressedVideoSource *video) { - vpx_codec_dec_cfg_t dec_cfg = {0}; + vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); RunLoop(video, dec_cfg); } diff --git a/source/libvpx/test/decode_test_driver.h b/source/libvpx/test/decode_test_driver.h index 1f73c7d..a757b59 100644 --- a/source/libvpx/test/decode_test_driver.h +++ 
b/source/libvpx/test/decode_test_driver.h @@ -125,20 +125,20 @@ class DecoderTest { const vpx_codec_dec_cfg_t &dec_cfg); // Hook to be called before decompressing every frame. - virtual void PreDecodeFrameHook(const CompressedVideoSource& video, - Decoder *decoder) {} + virtual void PreDecodeFrameHook(const CompressedVideoSource& /*video*/, + Decoder* /*decoder*/) {} // Hook to be called to handle decode result. Return true to continue. virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const CompressedVideoSource& /* video */, + const CompressedVideoSource& /*video*/, Decoder *decoder) { EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); return VPX_CODEC_OK == res_dec; } // Hook to be called on every decompressed frame. - virtual void DecompressedFrameHook(const vpx_image_t& img, - const unsigned int frame_number) {} + virtual void DecompressedFrameHook(const vpx_image_t& /*img*/, + const unsigned int /*frame_number*/) {} // Hook to be called on peek result virtual void HandlePeekResult(Decoder* const decoder, diff --git a/source/libvpx/test/encode_test_driver.cc b/source/libvpx/test/encode_test_driver.cc index 6d4281d..9702ddf 100644 --- a/source/libvpx/test/encode_test_driver.cc +++ b/source/libvpx/test/encode_test_driver.cc @@ -133,13 +133,13 @@ static bool compare_img(const vpx_image_t *img1, return match; } -void EncoderTest::MismatchHook(const vpx_image_t *img1, - const vpx_image_t *img2) { +void EncoderTest::MismatchHook(const vpx_image_t* /*img1*/, + const vpx_image_t* /*img2*/) { ASSERT_TRUE(0) << "Encode/Decode mismatch found"; } void EncoderTest::RunLoop(VideoSource *video) { - vpx_codec_dec_cfg_t dec_cfg = {0}; + vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); stats_.Reset(); diff --git a/source/libvpx/test/encode_test_driver.h b/source/libvpx/test/encode_test_driver.h index 2270ce2..a77bd64 100644 --- a/source/libvpx/test/encode_test_driver.h +++ b/source/libvpx/test/encode_test_driver.h @@ -189,20 +189,21 @@ class 
EncoderTest { virtual void RunLoop(VideoSource *video); // Hook to be called at the beginning of a pass. - virtual void BeginPassHook(unsigned int pass) {} + virtual void BeginPassHook(unsigned int /*pass*/) {} // Hook to be called at the end of a pass. virtual void EndPassHook() {} // Hook to be called before encoding a frame. - virtual void PreEncodeFrameHook(VideoSource *video) {} - virtual void PreEncodeFrameHook(VideoSource *video, Encoder *encoder) {} + virtual void PreEncodeFrameHook(VideoSource* /*video*/) {} + virtual void PreEncodeFrameHook(VideoSource* /*video*/, + Encoder* /*encoder*/) {} // Hook to be called on every compressed data packet. - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {} + virtual void FramePktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {} // Hook to be called on every PSNR packet. - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {} + virtual void PSNRPktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {} // Hook to determine whether the encode loop should continue. virtual bool Continue() const { @@ -218,19 +219,19 @@ class EncoderTest { const vpx_image_t *img2); // Hook to be called on every decompressed frame. - virtual void DecompressedFrameHook(const vpx_image_t& img, - vpx_codec_pts_t pts) {} + virtual void DecompressedFrameHook(const vpx_image_t& /*img*/, + vpx_codec_pts_t /*pts*/) {} // Hook to be called to handle decode result. Return true to continue. 
virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const VideoSource& /* video */, + const VideoSource& /*video*/, Decoder *decoder) { EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); return VPX_CODEC_OK == res_dec; } // Hook that can modify the encoder's output data - virtual const vpx_codec_cx_pkt_t * MutateEncoderOutputHook( + virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( const vpx_codec_cx_pkt_t *pkt) { return pkt; } diff --git a/source/libvpx/test/external_frame_buffer_test.cc b/source/libvpx/test/external_frame_buffer_test.cc index fb0449d..44eba33 100644 --- a/source/libvpx/test/external_frame_buffer_test.cc +++ b/source/libvpx/test/external_frame_buffer_test.cc @@ -285,7 +285,7 @@ class ExternalFrameBufferTest : public ::testing::Test { video_->Init(); video_->Begin(); - vpx_codec_dec_cfg_t cfg = {0}; + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); decoder_ = new libvpx_test::VP9Decoder(cfg, 0); ASSERT_TRUE(decoder_ != NULL); } diff --git a/source/libvpx/test/fdct4x4_test.cc b/source/libvpx/test/fdct4x4_test.cc index 7c48260..08a69ab 100644 --- a/source/libvpx/test/fdct4x4_test.cc +++ b/source/libvpx/test/fdct4x4_test.cc @@ -40,7 +40,7 @@ typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride, typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct4x4Param; typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht4x4Param; -void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) { +void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) { vp9_fdct4x4_c(in, out, stride); } @@ -48,7 +48,7 @@ void fht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) { vp9_fht4x4_c(in, out, stride, tx_type); } -void fwht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) { +void fwht4x4_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) { vp9_fwht4x4_c(in, out, stride); } diff --git a/source/libvpx/test/fdct8x8_test.cc 
b/source/libvpx/test/fdct8x8_test.cc index 567e5f6..a694f0c 100644 --- a/source/libvpx/test/fdct8x8_test.cc +++ b/source/libvpx/test/fdct8x8_test.cc @@ -39,7 +39,7 @@ typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride, typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct8x8Param; typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht8x8Param; -void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) { +void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) { vp9_fdct8x8_c(in, out, stride); } diff --git a/source/libvpx/test/frame_size_tests.cc b/source/libvpx/test/frame_size_tests.cc index db27975..1c9a522 100644 --- a/source/libvpx/test/frame_size_tests.cc +++ b/source/libvpx/test/frame_size_tests.cc @@ -27,7 +27,7 @@ class VP9FrameSizeTestsLarge } virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const libvpx_test::VideoSource &video, + const libvpx_test::VideoSource& /*video*/, libvpx_test::Decoder *decoder) { EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError(); return !::testing::Test::HasFailure(); diff --git a/source/libvpx/test/intrapred_test.cc b/source/libvpx/test/intrapred_test.cc index ead4760..f0d9c34 100644 --- a/source/libvpx/test/intrapred_test.cc +++ b/source/libvpx/test/intrapred_test.cc @@ -294,6 +294,11 @@ INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredYTest, ::testing::Values( vp8_build_intra_predictors_mby_s_ssse3)); #endif +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, IntraPredYTest, + ::testing::Values( + vp8_build_intra_predictors_mby_s_neon)); +#endif typedef void (*IntraPredUvFunc)(MACROBLOCKD *x, uint8_t *uabove_row, @@ -382,5 +387,10 @@ INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredUVTest, ::testing::Values( vp8_build_intra_predictors_mbuv_s_ssse3)); #endif +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, IntraPredUVTest, + ::testing::Values( + vp8_build_intra_predictors_mbuv_s_neon)); +#endif } // namespace diff --git a/source/libvpx/test/invalid_file_test.cc 
b/source/libvpx/test/invalid_file_test.cc index 0a1c17c..50e7c23 100644 --- a/source/libvpx/test/invalid_file_test.cc +++ b/source/libvpx/test/invalid_file_test.cc @@ -73,7 +73,7 @@ class InvalidFileTest void RunTest() { const DecodeParam input = GET_PARAM(1); libvpx_test::CompressedVideoSource *video = NULL; - vpx_codec_dec_cfg_t cfg = {0}; + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); cfg.threads = input.threads; const std::string filename = input.filename; @@ -113,9 +113,12 @@ TEST_P(InvalidFileTest, ReturnCode) { const DecodeParam kVP9InvalidFileTests[] = { {1, "invalid-vp90-02-v2.webm"}, {1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"}, - {1, "invalid-vp90-03-v2.webm"}, + {1, "invalid-vp90-03-v3.webm"}, {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf"}, {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf"}, + {1, "invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf"}, + {1, "invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf"}, + {1, "invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf"}, }; VP9_INSTANTIATE_TEST_CASE(InvalidFileTest, @@ -126,9 +129,9 @@ VP9_INSTANTIATE_TEST_CASE(InvalidFileTest, class InvalidFileInvalidPeekTest : public InvalidFileTest { protected: InvalidFileInvalidPeekTest() : InvalidFileTest() {} - virtual void HandlePeekResult(libvpx_test::Decoder *const decoder, - libvpx_test::CompressedVideoSource *video, - const vpx_codec_err_t res_peek) {} + virtual void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/, + libvpx_test::CompressedVideoSource* /*video*/, + const vpx_codec_err_t /*res_peek*/) {} }; TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { @@ -144,6 +147,10 @@ VP9_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest, const DecodeParam kMultiThreadedVP9InvalidFileTests[] = { {4, "invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm"}, + {4, "invalid-" + "vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf"}, + {2, 
"invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf"}, + {4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf"}, }; INSTANTIATE_TEST_CASE_P( diff --git a/source/libvpx/test/md5_helper.h b/source/libvpx/test/md5_helper.h index dc95582..1db712b 100644 --- a/source/libvpx/test/md5_helper.h +++ b/source/libvpx/test/md5_helper.h @@ -28,7 +28,8 @@ class MD5 { // plane, we never want to round down and thus skip a pixel so if // we are shifting by 1 (chroma_shift) we add 1 before doing the shift. // This works only for chroma_shift of 0 and 1. - const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1; + const int bytes_per_sample = + (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; const int h = plane ? (img->d_h + img->y_chroma_shift) >> img->y_chroma_shift : img->d_h; const int w = (plane ? (img->d_w + img->x_chroma_shift) >> diff --git a/source/libvpx/test/resize_test.cc b/source/libvpx/test/resize_test.cc index 8d08f1e..9d0c570 100644 --- a/source/libvpx/test/resize_test.cc +++ b/source/libvpx/test/resize_test.cc @@ -211,8 +211,8 @@ class ResizeInternalTest : public ResizeTest { EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { #if WRITE_COMPRESSED_STREAM + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { ++out_frames_; // Write initial file header if first frame. @@ -222,8 +222,8 @@ class ResizeInternalTest : public ResizeTest { // Write frame header and data. 
write_ivf_frame_header(pkt, outfile_); (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_); -#endif } +#endif double frame0_psnr_; #if WRITE_COMPRESSED_STREAM diff --git a/source/libvpx/test/sad_test.cc b/source/libvpx/test/sad_test.cc index e63770b..5377c1e 100644 --- a/source/libvpx/test/sad_test.cc +++ b/source/libvpx/test/sad_test.cc @@ -505,21 +505,6 @@ const SadMxNParam mmx_tests[] = { INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests)); #endif // CONFIG_VP8_ENCODER -#if CONFIG_VP9_ENCODER -const SadMxNVp9Func sad_16x16_mmx_vp9 = vp9_sad16x16_mmx; -const SadMxNVp9Func sad_8x16_mmx_vp9 = vp9_sad8x16_mmx; -const SadMxNVp9Func sad_16x8_mmx_vp9 = vp9_sad16x8_mmx; -const SadMxNVp9Func sad_8x8_mmx_vp9 = vp9_sad8x8_mmx; -const SadMxNVp9Func sad_4x4_mmx_vp9 = vp9_sad4x4_mmx; -const SadMxNVp9Param mmx_vp9_tests[] = { - make_tuple(16, 16, sad_16x16_mmx_vp9), - make_tuple(8, 16, sad_8x16_mmx_vp9), - make_tuple(16, 8, sad_16x8_mmx_vp9), - make_tuple(8, 8, sad_8x8_mmx_vp9), - make_tuple(4, 4, sad_4x4_mmx_vp9), -}; -INSTANTIATE_TEST_CASE_P(MMX, SADVP9Test, ::testing::ValuesIn(mmx_vp9_tests)); -#endif // CONFIG_VP9_ENCODER #endif // HAVE_MMX #if HAVE_SSE diff --git a/source/libvpx/test/svc_test.cc b/source/libvpx/test/svc_test.cc index 1cb01a4..218f53d 100644 --- a/source/libvpx/test/svc_test.cc +++ b/source/libvpx/test/svc_test.cc @@ -60,7 +60,7 @@ class SvcTest : public ::testing::Test { codec_enc_.kf_min_dist = 100; codec_enc_.kf_max_dist = 100; - vpx_codec_dec_cfg_t dec_cfg = {0}; + vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); VP9CodecFactory codec_factory; decoder_ = codec_factory.CreateDecoder(dec_cfg, 0); } @@ -112,7 +112,7 @@ class SvcTest : public ::testing::Test { video.Next(); } - // Flush encoder and test EOS packet + // Flush encoder and test EOS packet. 
res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(), video.duration(), VPX_DL_GOOD_QUALITY); stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_); @@ -135,7 +135,7 @@ class SvcTest : public ::testing::Test { EXPECT_EQ(1, vpx_svc_is_keyframe(&svc_)); } - outputs[*frame_received].buf = malloc(frame_size); + outputs[*frame_received].buf = malloc(frame_size + 16); ASSERT_TRUE(outputs[*frame_received].buf != NULL); memcpy(outputs[*frame_received].buf, vpx_svc_get_buffer(&svc_), frame_size); @@ -176,13 +176,13 @@ class SvcTest : public ::testing::Test { video.Next(); } - // Flush Encoder + // Flush encoder. res = vpx_svc_encode(&svc_, &codec_, NULL, 0, video.duration(), VPX_DL_GOOD_QUALITY); EXPECT_EQ(VPX_CODEC_OK, res); StoreFrames(n, outputs, &frame_received); - EXPECT_EQ(frame_received, (size_t)n); + EXPECT_EQ(frame_received, static_cast<size_t>(n)); ReleaseEncoder(); } @@ -204,7 +204,7 @@ class SvcTest : public ::testing::Test { ++decoded_frames; DxDataIterator dec_iter = decoder_->GetDxData(); - while (dec_iter.Next()) { + while (dec_iter.Next() != NULL) { ++received_frames; } } @@ -212,12 +212,13 @@ class SvcTest : public ::testing::Test { EXPECT_EQ(received_frames, n); } - void DropEnhancementLayers(struct vpx_fixed_buf *const inputs, - const int num_super_frames, - const int remained_layers) { + void DropLayersAndMakeItVP9Comaptible(struct vpx_fixed_buf *const inputs, + const int num_super_frames, + const int remained_spatial_layers, + const bool is_multiple_frame_contexts) { ASSERT_TRUE(inputs != NULL); ASSERT_GT(num_super_frames, 0); - ASSERT_GT(remained_layers, 0); + ASSERT_GT(remained_spatial_layers, 0); for (int i = 0; i < num_super_frames; ++i) { uint32_t frame_sizes[8] = {0}; @@ -233,34 +234,112 @@ class SvcTest : public ::testing::Test { NULL, NULL); ASSERT_EQ(VPX_CODEC_OK, res); - uint8_t *frame_data = static_cast<uint8_t *>(inputs[i].buf); - uint8_t *frame_start = frame_data; - for (frame = 0; frame < frame_count; ++frame) { - // Looking for a 
visible frame - if (frame_data[0] & 0x02) { - ++frames_found; - if (frames_found == remained_layers) - break; + if (frame_count == 0) { + // There's no super frame but only a single frame. + ASSERT_EQ(1, remained_spatial_layers); + if (is_multiple_frame_contexts) { + // Make a new super frame. + uint8_t marker = 0xc1; + unsigned int mask; + int mag; + + // Choose the magnitude. + for (mag = 0, mask = 0xff; mag < 4; ++mag) { + if (inputs[i].sz < mask) + break; + mask <<= 8; + mask |= 0xff; + } + marker |= mag << 3; + int index_sz = 2 + (mag + 1) * 2; + + inputs[i].buf = realloc(inputs[i].buf, inputs[i].sz + index_sz + 16); + ASSERT_TRUE(inputs[i].buf != NULL); + uint8_t *frame_data = static_cast<uint8_t*>(inputs[i].buf); + frame_data[0] &= ~2; // Set the show_frame flag to 0. + frame_data += inputs[i].sz; + // Add an one byte frame with show_existing_frame. + *frame_data++ = 0x88; + + // Write the super frame index. + *frame_data++ = marker; + + frame_sizes[0] = inputs[i].sz; + frame_sizes[1] = 1; + for (int j = 0; j < 2; ++j) { + unsigned int this_sz = frame_sizes[j]; + for (int k = 0; k <= mag; k++) { + *frame_data++ = this_sz & 0xff; + this_sz >>= 8; + } + } + *frame_data++ = marker; + inputs[i].sz += index_sz + 1; } + } else { + // Found a super frame. + uint8_t *frame_data = static_cast<uint8_t*>(inputs[i].buf); + uint8_t *frame_start = frame_data; + for (frame = 0; frame < frame_count; ++frame) { + // Looking for a visible frame. + if (frame_data[0] & 0x02) { + ++frames_found; + if (frames_found == remained_spatial_layers) + break; + } + frame_data += frame_sizes[frame]; + } + ASSERT_LT(frame, frame_count) << "Couldn't find a visible frame. 
" + << "remained_spatial_layers: " << remained_spatial_layers + << " super_frame: " << i + << " is_multiple_frame_context: " << is_multiple_frame_contexts; + if (frame == frame_count - 1 && !is_multiple_frame_contexts) + continue; + frame_data += frame_sizes[frame]; + + // We need to add one more frame for multiple frame contexts. + if (is_multiple_frame_contexts) + ++frame; + uint8_t marker = + static_cast<const uint8_t*>(inputs[i].buf)[inputs[i].sz - 1]; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frame_count; + const size_t new_index_sz = 2 + mag * (frame + 1); + marker &= 0x0f8; + marker |= frame; + + // Copy existing frame sizes. + memmove(frame_data + (is_multiple_frame_contexts ? 2 : 1), + frame_start + inputs[i].sz - index_sz + 1, new_index_sz - 2); + if (is_multiple_frame_contexts) { + // Add a one byte frame with flag show_existing_frame. + *frame_data++ = 0x88 | (remained_spatial_layers - 1); + } + // New marker. + frame_data[0] = marker; + frame_data += (mag * (frame + 1) + 1); + + if (is_multiple_frame_contexts) { + // Write the frame size for the one byte frame. + frame_data -= mag; + *frame_data++ = 1; + for (uint32_t j = 1; j < mag; ++j) { + *frame_data++ = 0; + } + } + + *frame_data++ = marker; + inputs[i].sz = frame_data - frame_start; + + if (is_multiple_frame_contexts) { + // Change the show frame flag to 0 for all frames. 
+ for (int j = 0; j < frame; ++j) { + frame_start[0] &= ~2; + frame_start += frame_sizes[j]; + } + } } - ASSERT_LT(frame, frame_count); - if (frame == frame_count - 1) - continue; - - frame_data += frame_sizes[frame]; - uint8_t marker = - static_cast<const uint8_t *>(inputs[i].buf)[inputs[i].sz - 1]; - const uint32_t mag = ((marker >> 3) & 0x3) + 1; - const size_t index_sz = 2 + mag * frame_count; - const size_t new_index_sz = 2 + mag * (frame + 1); - marker &= 0x0f8; - marker |= frame; - frame_data[0] = marker; - memcpy(frame_data + 1, frame_start + inputs[i].sz - index_sz + 1, - new_index_sz - 2); - frame_data[new_index_sz - 1] = marker; - inputs[i].sz = frame_data - frame_start + new_index_sz; } } @@ -326,7 +405,7 @@ TEST_F(SvcTest, InvalidOptions) { } TEST_F(SvcTest, SetLayersOption) { - vpx_codec_err_t res = vpx_svc_set_options(&svc_, "layers=3"); + vpx_codec_err_t res = vpx_svc_set_options(&svc_, "spatial-layers=3"); EXPECT_EQ(VPX_CODEC_OK, res); InitializeEncoder(); EXPECT_EQ(3, svc_.spatial_layers); @@ -334,7 +413,7 @@ TEST_F(SvcTest, SetLayersOption) { TEST_F(SvcTest, SetMultipleOptions) { vpx_codec_err_t res = - vpx_svc_set_options(&svc_, "layers=2 scale-factors=1/3,2/3"); + vpx_svc_set_options(&svc_, "spatial-layers=2 scale-factors=1/3,2/3"); EXPECT_EQ(VPX_CODEC_OK, res); InitializeEncoder(); EXPECT_EQ(2, svc_.spatial_layers); @@ -496,7 +575,7 @@ TEST_F(SvcTest, TwoPassEncode20FramesWithAltRef) { FreeBitstreamBuffers(&outputs[0], 20); } -TEST_F(SvcTest, TwoPassEncode2LayersDecodeBaseLayerOnly) { +TEST_F(SvcTest, TwoPassEncode2SpatialLayersDecodeBaseLayerOnly) { // First pass encode std::string stats_buf; Pass1EncodeNFrames(10, 2, &stats_buf); @@ -507,12 +586,12 @@ TEST_F(SvcTest, TwoPassEncode2LayersDecodeBaseLayerOnly) { vpx_fixed_buf outputs[10]; memset(&outputs[0], 0, sizeof(outputs)); Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DropEnhancementLayers(&outputs[0], 10, 1); + DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, false); 
DecodeNFrames(&outputs[0], 10); FreeBitstreamBuffers(&outputs[0], 10); } -TEST_F(SvcTest, TwoPassEncode5LayersDecode54321Layers) { +TEST_F(SvcTest, TwoPassEncode5SpatialLayersDecode54321Layers) { // First pass encode std::string stats_buf; Pass1EncodeNFrames(10, 5, &stats_buf); @@ -525,13 +604,13 @@ TEST_F(SvcTest, TwoPassEncode5LayersDecode54321Layers) { Pass2EncodeNFrames(&stats_buf, 10, 5, &outputs[0]); DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 4); + DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 4, false); DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 3); + DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 3, false); DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 2); + DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 2, false); DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 1); + DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, false); DecodeNFrames(&outputs[0], 10); FreeBitstreamBuffers(&outputs[0], 10); @@ -568,12 +647,212 @@ TEST_F(SvcTest, TwoPassEncode3SNRLayersDecode321Layers) { memset(&outputs[0], 0, sizeof(outputs)); Pass2EncodeNFrames(&stats_buf, 20, 3, &outputs[0]); DecodeNFrames(&outputs[0], 20); - DropEnhancementLayers(&outputs[0], 20, 2); + DropLayersAndMakeItVP9Comaptible(&outputs[0], 20, 2, false); DecodeNFrames(&outputs[0], 20); - DropEnhancementLayers(&outputs[0], 20, 1); + DropLayersAndMakeItVP9Comaptible(&outputs[0], 20, 1, false); DecodeNFrames(&outputs[0], 20); FreeBitstreamBuffers(&outputs[0], 20); } +TEST_F(SvcTest, SetMultipleFrameContextsOption) { + svc_.spatial_layers = 5; + vpx_codec_err_t res = + vpx_svc_set_options(&svc_, "multi-frame-contexts=1"); + EXPECT_EQ(VPX_CODEC_OK, res); + res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); + + svc_.spatial_layers = 2; + res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1"); + InitializeEncoder(); +} + 
+TEST_F(SvcTest, TwoPassEncode2SpatialLayersWithMultipleFrameContexts) { + // First pass encode + std::string stats_buf; + Pass1EncodeNFrames(10, 2, &stats_buf); + + // Second pass encode + codec_enc_.g_pass = VPX_RC_LAST_PASS; + codec_enc_.g_error_resilient = 0; + vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1"); + vpx_fixed_buf outputs[10]; + memset(&outputs[0], 0, sizeof(outputs)); + Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); + DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 2, true); + DecodeNFrames(&outputs[0], 10); + FreeBitstreamBuffers(&outputs[0], 10); +} + +TEST_F(SvcTest, + TwoPassEncode2SpatialLayersWithMultipleFrameContextsDecodeBaselayer) { + // First pass encode + std::string stats_buf; + Pass1EncodeNFrames(10, 2, &stats_buf); + + // Second pass encode + codec_enc_.g_pass = VPX_RC_LAST_PASS; + codec_enc_.g_error_resilient = 0; + vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1"); + vpx_fixed_buf outputs[10]; + memset(&outputs[0], 0, sizeof(outputs)); + Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); + DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, true); + DecodeNFrames(&outputs[0], 10); + FreeBitstreamBuffers(&outputs[0], 10); +} + +TEST_F(SvcTest, TwoPassEncode2SNRLayersWithMultipleFrameContexts) { + // First pass encode + std::string stats_buf; + vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1"); + Pass1EncodeNFrames(10, 2, &stats_buf); + + // Second pass encode + codec_enc_.g_pass = VPX_RC_LAST_PASS; + codec_enc_.g_error_resilient = 0; + vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 scale-factors=1/1,1/1 " + "multi-frame-contexts=1"); + vpx_fixed_buf outputs[10]; + memset(&outputs[0], 0, sizeof(outputs)); + Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); + DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 2, true); + DecodeNFrames(&outputs[0], 10); + FreeBitstreamBuffers(&outputs[0], 10); +} + +TEST_F(SvcTest, + TwoPassEncode3SNRLayersWithMultipleFrameContextsDecode321Layer) { + 
// First pass encode + std::string stats_buf; + vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1"); + Pass1EncodeNFrames(10, 3, &stats_buf); + + // Second pass encode + codec_enc_.g_pass = VPX_RC_LAST_PASS; + codec_enc_.g_error_resilient = 0; + vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1 " + "multi-frame-contexts=1"); + vpx_fixed_buf outputs[10]; + memset(&outputs[0], 0, sizeof(outputs)); + Pass2EncodeNFrames(&stats_buf, 10, 3, &outputs[0]); + + vpx_fixed_buf outputs_new[10]; + for (int i = 0; i < 10; ++i) { + outputs_new[i].buf = malloc(outputs[i].sz + 16); + ASSERT_TRUE(outputs_new[i].buf != NULL); + memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz); + outputs_new[i].sz = outputs[i].sz; + } + DropLayersAndMakeItVP9Comaptible(&outputs_new[0], 10, 3, true); + DecodeNFrames(&outputs_new[0], 10); + + for (int i = 0; i < 10; ++i) { + memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz); + outputs_new[i].sz = outputs[i].sz; + } + DropLayersAndMakeItVP9Comaptible(&outputs_new[0], 10, 2, true); + DecodeNFrames(&outputs_new[0], 10); + + for (int i = 0; i < 10; ++i) { + memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz); + outputs_new[i].sz = outputs[i].sz; + } + DropLayersAndMakeItVP9Comaptible(&outputs_new[0], 10, 1, true); + DecodeNFrames(&outputs_new[0], 10); + + FreeBitstreamBuffers(&outputs[0], 10); + FreeBitstreamBuffers(&outputs_new[0], 10); +} + +TEST_F(SvcTest, TwoPassEncode2TemporalLayers) { + // First pass encode + std::string stats_buf; + vpx_svc_set_options(&svc_, "scale-factors=1/1"); + svc_.temporal_layers = 2; + Pass1EncodeNFrames(10, 1, &stats_buf); + + // Second pass encode + codec_enc_.g_pass = VPX_RC_LAST_PASS; + svc_.temporal_layers = 2; + vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1"); + vpx_fixed_buf outputs[10]; + memset(&outputs[0], 0, sizeof(outputs)); + Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); + DecodeNFrames(&outputs[0], 10); + FreeBitstreamBuffers(&outputs[0], 
10); +} + +TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContexts) { + // First pass encode + std::string stats_buf; + vpx_svc_set_options(&svc_, "scale-factors=1/1"); + svc_.temporal_layers = 2; + Pass1EncodeNFrames(10, 1, &stats_buf); + + // Second pass encode + codec_enc_.g_pass = VPX_RC_LAST_PASS; + svc_.temporal_layers = 2; + codec_enc_.g_error_resilient = 0; + vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 " + "multi-frame-contexts=1"); + vpx_fixed_buf outputs[10]; + memset(&outputs[0], 0, sizeof(outputs)); + Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); + DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, true); + DecodeNFrames(&outputs[0], 10); + FreeBitstreamBuffers(&outputs[0], 10); +} + +TEST_F(SvcTest, TwoPassEncode2TemporalLayersDecodeBaseLayer) { + // First pass encode + std::string stats_buf; + vpx_svc_set_options(&svc_, "scale-factors=1/1"); + svc_.temporal_layers = 2; + Pass1EncodeNFrames(10, 1, &stats_buf); + + // Second pass encode + codec_enc_.g_pass = VPX_RC_LAST_PASS; + svc_.temporal_layers = 2; + vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1"); + vpx_fixed_buf outputs[10]; + memset(&outputs[0], 0, sizeof(outputs)); + Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); + + vpx_fixed_buf base_layer[5]; + for (int i = 0; i < 5; ++i) + base_layer[i] = outputs[i * 2]; + + DecodeNFrames(&base_layer[0], 5); + FreeBitstreamBuffers(&outputs[0], 10); +} + +TEST_F(SvcTest, + TwoPassEncode2TemporalLayersWithMultipleFrameContextsDecodeBaseLayer) { + // First pass encode + std::string stats_buf; + vpx_svc_set_options(&svc_, "scale-factors=1/1"); + svc_.temporal_layers = 2; + Pass1EncodeNFrames(10, 1, &stats_buf); + + // Second pass encode + codec_enc_.g_pass = VPX_RC_LAST_PASS; + svc_.temporal_layers = 2; + codec_enc_.g_error_resilient = 0; + vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 " + "multi-frame-contexts=1"); + vpx_fixed_buf outputs[10]; + memset(&outputs[0], 0, 
sizeof(outputs)); + Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); + DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, true); + + vpx_fixed_buf base_layer[5]; + for (int i = 0; i < 5; ++i) + base_layer[i] = outputs[i * 2]; + + DecodeNFrames(&base_layer[0], 5); + FreeBitstreamBuffers(&outputs[0], 10); +} + } // namespace diff --git a/source/libvpx/test/test-data.sha1 b/source/libvpx/test/test-data.sha1 index ee6289f..84b13f9 100644 --- a/source/libvpx/test/test-data.sha1 +++ b/source/libvpx/test/test-data.sha1 @@ -10,8 +10,8 @@ fe346136b9b8c1e6f6084cc106485706915795e4 invalid-vp90-01-v2.webm 25751f5d3b05ff03f0719ad42cd625348eb8961e invalid-vp90-01-v2.webm.res d78e2fceba5ac942246503ec8366f879c4775ca5 invalid-vp90-02-v2.webm 8e2eff4af87d2b561cce2365713269e301457ef3 invalid-vp90-02-v2.webm.res -df1a1453feb3c00d7d89746c7003b4163523bff3 invalid-vp90-03-v2.webm -25dd58c22d23f75304d7ce7f69f4e5b02ef9119a invalid-vp90-03-v2.webm.res +df1a1453feb3c00d7d89746c7003b4163523bff3 invalid-vp90-03-v3.webm +4935c62becc68c13642a03db1e6d3e2331c1c612 invalid-vp90-03-v3.webm.res d637297561dd904eb2c97a9015deeb31c4a1e8d2 invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm 3a204bdbeaa3c6458b77bcebb8366d107267f55d invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res a432f96ff0a787268e2f94a8092ab161a18d1b06 park_joy_90p_10_420.y4m @@ -681,3 +681,19 @@ e7d315dbf4f3928779e0dc624311196d44491d32 niklas_1280_720_30.yuv c77e4a26616add298a05dd5d12397be22c0e40c5 vp90-2-18-resize.ivf c12918cf0a716417fba2de35c3fc5ab90e52dfce vp90-2-18-resize.ivf.md5 717da707afcaa1f692ff1946f291054eb75a4f06 screendata.y4m +b7c1296630cdf1a7ef493d15ff4f9eb2999202f6 invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf +0a3884edb3fd8f9d9b500223e650f7de257b67d8 invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res +fac89b5735be8a86b0dc05159f996a5c3208ae32 invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf +22e0ee8babe574722baf4ef6d7ff5d7cf80d386c 
invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf.res +4506dfdcdf8ee4250924b075a0dcf1f070f72e5a invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf +d3ea592c8d7b05d14c7ed48befc0a3aaf7709b7a invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf.res +65e93f9653bcf65b022f7d225268d1a90a76e7bb vp90-2-19-skip.webm +368dccdde5288c13c25695d2eacdc7402cadf613 vp90-2-19-skip.webm.md5 +ffe460282df2b0e7d4603c2158653ad96f574b02 vp90-2-19-skip-01.webm +bd21bc9eda4a4a36b221d71ede3a139fc3c7bd85 vp90-2-19-skip-01.webm.md5 +b03c408cf23158638da18dbc3323b99a1635c68a invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf +0a3884edb3fd8f9d9b500223e650f7de257b67d8 invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res +5e67e24e7f53fd189e565513cef8519b1bd6c712 invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf +741158f67c0d9d23726624d06bdc482ad368afc9 invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res +8b1f7bf7e86c0976d277f60e8fcd9539e75a079a invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf +fb79dcbbbb8c82d5a750e339acce66e39a32f15f invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf.res diff --git a/source/libvpx/test/test.mk b/source/libvpx/test/test.mk index 0814c2b..c839c92 100644 --- a/source/libvpx/test/test.mk +++ b/source/libvpx/test/test.mk @@ -785,6 +785,10 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm.md5 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm.md5 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5 @@ -793,16 +797,28 @@ 
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm.res -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v2.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v2.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += 
invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res ifeq ($(CONFIG_DECODE_PERF_TESTS),yes) # BBB VP9 streams diff --git a/source/libvpx/test/test_vectors.cc b/source/libvpx/test/test_vectors.cc index dbdbdd6..cccebf8 100644 --- a/source/libvpx/test/test_vectors.cc +++ b/source/libvpx/test/test_vectors.cc @@ -181,7 +181,8 @@ const char *const kVP9TestVectors[] = { "vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm", "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm", "vp90-2-16-intra-only.webm", "vp90-2-17-show-existing-frame.webm", - "vp90-2-18-resize.ivf", "vp91-2-04-yuv444.webm", + "vp90-2-18-resize.ivf", "vp90-2-19-skip.webm", + "vp90-2-19-skip-01.webm", "vp91-2-04-yuv444.webm", }; const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors); #endif // CONFIG_VP9_DECODER diff --git a/source/libvpx/test/tile_independence_test.cc b/source/libvpx/test/tile_independence_test.cc index d714452..b9f879d 100644 --- a/source/libvpx/test/tile_independence_test.cc +++ b/source/libvpx/test/tile_independence_test.cc @@ -29,7 +29,7 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, md5_inv_order_(), n_tiles_(GET_PARAM(1)) { init_flags_ = VPX_CODEC_USE_PSNR; - vpx_codec_dec_cfg_t cfg; + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); cfg.w = 704; cfg.h = 144; cfg.threads = 1; diff --git a/source/libvpx/test/user_priv_test.cc 
b/source/libvpx/test/user_priv_test.cc index 22fce85..8512d88 100644 --- a/source/libvpx/test/user_priv_test.cc +++ b/source/libvpx/test/user_priv_test.cc @@ -47,7 +47,7 @@ string DecodeFile(const string &filename) { libvpx_test::WebMVideoSource video(filename); video.Init(); - vpx_codec_dec_cfg_t cfg = {0}; + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); libvpx_test::VP9Decoder decoder(cfg, 0); libvpx_test::MD5 md5; diff --git a/source/libvpx/test/variance_test.cc b/source/libvpx/test/variance_test.cc index 7d81182..f76402e 100644 --- a/source/libvpx/test/variance_test.cc +++ b/source/libvpx/test/variance_test.cc @@ -35,6 +35,14 @@ using ::std::tr1::make_tuple; using ::std::tr1::tuple; using libvpx_test::ACMRandom; +static unsigned int mb_ss_ref(const int16_t *src) { + unsigned int res = 0; + for (int i = 0; i < 256; ++i) { + res += src[i] * src[i]; + } + return res; +} + static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src, int l2w, int l2h, unsigned int *sse_ptr) { int se = 0; @@ -76,6 +84,50 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, return sse - (((int64_t) se * se) >> (l2w + l2h)); } +typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src); + +class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> { + public: + SumOfSquaresTest() : func_(GetParam()) {} + + virtual ~SumOfSquaresTest() { + libvpx_test::ClearSystemState(); + } + + protected: + void ConstTest(); + void RefTest(); + + SumOfSquaresFunction func_; + ACMRandom rnd_; +}; + +void SumOfSquaresTest::ConstTest() { + int16_t mem[256]; + unsigned int res; + for (int v = 0; v < 256; ++v) { + for (int i = 0; i < 256; ++i) { + mem[i] = v; + } + ASM_REGISTER_STATE_CHECK(res = func_(mem)); + EXPECT_EQ(256u * (v * v), res); + } +} + +void SumOfSquaresTest::RefTest() { + int16_t mem[256]; + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 256; ++j) { + mem[j] = rnd_.Rand8() - rnd_.Rand8(); + } + + const unsigned int 
expected = mb_ss_ref(mem); + unsigned int res; + ASM_REGISTER_STATE_CHECK(res = func_(mem)); + EXPECT_EQ(expected, res); + } +} + template<typename VarianceFunctionType> class VarianceTest : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > { @@ -88,7 +140,7 @@ class VarianceTest height_ = 1 << log2height_; variance_ = get<2>(params); - rnd(ACMRandom::DeterministicSeed()); + rnd_.Reset(ACMRandom::DeterministicSeed()); block_size_ = width_ * height_; src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_)); ref_ = new uint8_t[block_size_]; @@ -107,7 +159,7 @@ class VarianceTest void RefTest(); void OneQuarterTest(); - ACMRandom rnd; + ACMRandom rnd_; uint8_t* src_; uint8_t* ref_; int width_, log2width_; @@ -135,8 +187,8 @@ template<typename VarianceFunctionType> void VarianceTest<VarianceFunctionType>::RefTest() { for (int i = 0; i < 10; ++i) { for (int j = 0; j < block_size_; j++) { - src_[j] = rnd.Rand8(); - ref_[j] = rnd.Rand8(); + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); } unsigned int sse1, sse2; unsigned int var1; @@ -206,7 +258,7 @@ class SubpelVarianceTest height_ = 1 << log2height_; subpel_variance_ = get<2>(params); - rnd(ACMRandom::DeterministicSeed()); + rnd_.Reset(ACMRandom::DeterministicSeed()); block_size_ = width_ * height_; src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_)); sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_)); @@ -226,7 +278,7 @@ class SubpelVarianceTest protected: void RefTest(); - ACMRandom rnd; + ACMRandom rnd_; uint8_t *src_; uint8_t *ref_; uint8_t *sec_; @@ -241,10 +293,10 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() { for (int x = 0; x < 16; ++x) { for (int y = 0; y < 16; ++y) { for (int j = 0; j < block_size_; j++) { - src_[j] = rnd.Rand8(); + src_[j] = rnd_.Rand8(); } for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - ref_[j] = rnd.Rand8(); + ref_[j] = rnd_.Rand8(); } unsigned int sse1, sse2; unsigned int var1; @@ -263,11 
+315,11 @@ void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() { for (int x = 0; x < 16; ++x) { for (int y = 0; y < 16; ++y) { for (int j = 0; j < block_size_; j++) { - src_[j] = rnd.Rand8(); - sec_[j] = rnd.Rand8(); + src_[j] = rnd_.Rand8(); + sec_[j] = rnd_.Rand8(); } for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - ref_[j] = rnd.Rand8(); + ref_[j] = rnd_.Rand8(); } unsigned int sse1, sse2; unsigned int var1; @@ -362,6 +414,13 @@ INSTANTIATE_TEST_CASE_P( namespace vp9 { #if CONFIG_VP9_ENCODER + +TEST_P(SumOfSquaresTest, Const) { ConstTest(); } +TEST_P(SumOfSquaresTest, Ref) { RefTest(); } + +INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest, + ::testing::Values(vp9_get_mb_ss_c)); + typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest; typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest; typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest; @@ -485,23 +544,12 @@ INSTANTIATE_TEST_CASE_P( make_tuple(6, 5, subpel_avg_variance64x32_c), make_tuple(6, 6, subpel_avg_variance64x64_c))); -#if HAVE_MMX -const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx; -const vp9_variance_fn_t variance8x8_mmx = vp9_variance8x8_mmx; -const vp9_variance_fn_t variance8x16_mmx = vp9_variance8x16_mmx; -const vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx; -const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx; -INSTANTIATE_TEST_CASE_P( - MMX, VP9VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_mmx), - make_tuple(3, 3, variance8x8_mmx), - make_tuple(3, 4, variance8x16_mmx), - make_tuple(4, 3, variance16x8_mmx), - make_tuple(4, 4, variance16x16_mmx))); -#endif - #if HAVE_SSE2 #if CONFIG_USE_X86INC + +INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest, + ::testing::Values(vp9_get_mb_ss_sse2)); + const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2; const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2; const vp9_variance_fn_t variance8x4_sse2 = 
vp9_variance8x4_sse2; diff --git a/source/libvpx/test/vp8_decrypt_test.cc b/source/libvpx/test/vp8_decrypt_test.cc index 470fdf1..972a1d9 100644 --- a/source/libvpx/test/vp8_decrypt_test.cc +++ b/source/libvpx/test/vp8_decrypt_test.cc @@ -47,7 +47,7 @@ TEST(TestDecrypt, DecryptWorksVp8) { libvpx_test::IVFVideoSource video("vp80-00-comprehensive-001.ivf"); video.Init(); - vpx_codec_dec_cfg_t dec_cfg = {0}; + vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); VP8Decoder decoder(dec_cfg, 0); video.Begin(); diff --git a/source/libvpx/test/vp8_multi_resolution_encoder.sh b/source/libvpx/test/vp8_multi_resolution_encoder.sh new file mode 100755 index 0000000..a8b7fe7 --- /dev/null +++ b/source/libvpx/test/vp8_multi_resolution_encoder.sh @@ -0,0 +1,75 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file tests the libvpx vp8_multi_resolution_encoder example. To add new +## tests to this file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to vp8_mre_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required. +vp8_multi_resolution_encoder_verify_environment() { + if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then + if [ ! -e "${YUV_RAW_INPUT}" ]; then + elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + local readonly app="vp8_multi_resolution_encoder" + if [ -z "$(vpx_tool_path "${app}")" ]; then + elog "${app} not found. It must exist in LIBVPX_BIN_PATH or its parent." + return 1 + fi + fi +} + +# Runs vp8_multi_resolution_encoder. 
Simply forwards all arguments to +# vp8_multi_resolution_encoder after building path to the executable. +vp8_mre() { + local readonly encoder="$(vpx_tool_path vp8_multi_resolution_encoder)" + if [ ! -x "${encoder}" ]; then + elog "${encoder} does not exist or is not executable." + return 1 + fi + + eval "${VPX_TEST_PREFIX}" "${encoder}" "$@" ${devnull} +} + +vp8_multi_resolution_encoder_three_formats() { + local readonly output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf + ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf + ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf" + + if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then + if [ "$(vp8_encode_available)" = "yes" ]; then + # Param order: + # Input width + # Input height + # Input file path + # Output file names + # Output PSNR + vp8_mre "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" \ + "${YUV_RAW_INPUT}" \ + ${output_files} \ + 0 + + for output_file in ${output_files}; do + if [ ! -e "${output_file}" ]; then + elog "Missing output file: ${output_file}" + return 1 + fi + done + fi + fi +} + +vp8_mre_tests="vp8_multi_resolution_encoder_three_formats" +run_tests vp8_multi_resolution_encoder_verify_environment "${vp8_mre_tests}" diff --git a/source/libvpx/test/vp9_decrypt_test.cc b/source/libvpx/test/vp9_decrypt_test.cc index 88a3c14..d988612 100644 --- a/source/libvpx/test/vp9_decrypt_test.cc +++ b/source/libvpx/test/vp9_decrypt_test.cc @@ -47,7 +47,7 @@ TEST(TestDecrypt, DecryptWorksVp9) { libvpx_test::IVFVideoSource video("vp90-2-05-resize.ivf"); video.Init(); - vpx_codec_dec_cfg_t dec_cfg = {0}; + vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); VP9Decoder decoder(dec_cfg, 0); video.Begin(); diff --git a/source/libvpx/test/vp9_thread_test.cc b/source/libvpx/test/vp9_thread_test.cc index d7fc4ee..cc35476 100644 --- a/source/libvpx/test/vp9_thread_test.cc +++ b/source/libvpx/test/vp9_thread_test.cc @@ -163,7 +163,7 @@ string DecodeFile(const string& filename, int num_threads) { 
libvpx_test::WebMVideoSource video(filename); video.Init(); - vpx_codec_dec_cfg_t cfg = {0}; + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); cfg.threads = num_threads; libvpx_test::VP9Decoder decoder(cfg, 0); diff --git a/source/libvpx/test/vpxenc.sh b/source/libvpx/test/vpxenc.sh index b6482c6..9674bdc 100755 --- a/source/libvpx/test/vpxenc.sh +++ b/source/libvpx/test/vpxenc.sh @@ -41,6 +41,40 @@ vpxenc_can_encode_vp9() { fi } +# Echo vpxenc command line parameters allowing use of +# hantro_collage_w352h288.yuv as input. +yuv_input_hantro_collage() { + echo ""${YUV_RAW_INPUT}" + --width="${YUV_RAW_INPUT_WIDTH}" + --height="${YUV_RAW_INPUT_HEIGHT}"" +} + +# Echo default vpxenc real time encoding params. $1 is the codec, which defaults +# to vp8 if unspecified. +vpxenc_rt_params() { + local readonly codec="${1:-vp8}" + echo "--codec=${codec} + --buf-initial-sz=500 + --buf-optimal-sz=600 + --buf-sz=1000 + --cpu-used=-5 + --end-usage=cbr + --error-resilient=1 + --kf-max-dist=90000 + --lag-in-frames=0 + --max-intra-rate=300 + --max-q=56 + --min-q=2 + --noise-sensitivity=0 + --overshoot-pct=50 + --passes=1 + --profile=0 + --resize-allowed=0 + --rt + --static-thresh=0 + --undershoot-pct=50" +} + # Wrapper function for running vpxenc with pipe input. Requires that # LIBVPX_BIN_PATH points to the directory containing vpxenc. $1 is used as the # input file path and shifted away. All remaining parameters are passed through @@ -59,9 +93,9 @@ vpxenc_pipe() { # shifted away. All remaining parameters are passed through to vpxenc. 
vpxenc() { local readonly encoder="$(vpx_tool_path vpxenc)" - local readonly input="${1}" + local readonly input="$1" shift - eval "${VPX_TEST_PREFIX}" "${encoder}" "$input" \ + eval "${VPX_TEST_PREFIX}" "${encoder}" "${input}" \ --test-decode=fatal \ "$@" ${devnull} } @@ -69,13 +103,11 @@ vpxenc() { vpxenc_vp8_ivf() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf" - vpxenc --codec=vp8 \ - --width="${YUV_RAW_INPUT_WIDTH}" \ - --height="${YUV_RAW_INPUT_HEIGHT}" \ + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp8 \ --limit="${TEST_FRAMES}" \ --ivf \ - --output="${output}" \ - "${YUV_RAW_INPUT}" + --output="${output}" if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -88,12 +120,10 @@ vpxenc_vp8_webm() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" - vpxenc --codec=vp8 \ - --width="${YUV_RAW_INPUT_WIDTH}" \ - --height="${YUV_RAW_INPUT_HEIGHT}" \ + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp8 \ --limit="${TEST_FRAMES}" \ - --output="${output}" \ - "${YUV_RAW_INPUT}" + --output="${output}" if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -102,17 +132,29 @@ vpxenc_vp8_webm() { fi } +vpxenc_vp8_webm_rt() { + if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm" + vpxenc $(yuv_input_hantro_collage) \ + $(vpxenc_rt_params vp8) \ + --output="${output}" + if [ ! -e "${output}" ]; then + elog "Output file does not exist." 
+ return 1 + fi + fi +} + vpxenc_vp8_webm_2pass() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" - vpxenc --codec=vp8 \ - --width="${YUV_RAW_INPUT_WIDTH}" \ - --height="${YUV_RAW_INPUT_HEIGHT}" \ + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp8 \ --limit="${TEST_FRAMES}" \ --output="${output}" \ - --passes=2 \ - "${YUV_RAW_INPUT}" + --passes=2 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -127,15 +169,13 @@ vpxenc_vp8_webm_lag10_frames20() { local readonly lag_total_frames=20 local readonly lag_frames=10 local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm" - vpxenc --codec=vp8 \ - --width="${YUV_RAW_INPUT_WIDTH}" \ - --height="${YUV_RAW_INPUT_HEIGHT}" \ + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp8 \ --limit="${lag_total_frames}" \ --lag-in-frames="${lag_frames}" \ --output="${output}" \ --auto-alt-ref=1 \ - --passes=2 \ - "${YUV_RAW_INPUT}" + --passes=2 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -147,14 +187,11 @@ vpxenc_vp8_webm_lag10_frames20() { vpxenc_vp8_ivf_piped_input() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf" - cat "${YUV_RAW_INPUT}" \ - | vpxenc --codec=vp8 \ - --width="${YUV_RAW_INPUT_WIDTH}" \ - --height="${YUV_RAW_INPUT_HEIGHT}" \ - --limit="${TEST_FRAMES}" \ - --ivf \ - --output="${output}" \ - - + vpxenc_pipe $(yuv_input_hantro_collage) \ + --codec=vp8 \ + --limit="${TEST_FRAMES}" \ + --ivf \ + --output="${output}" if [ ! -e "${output}" ]; then elog "Output file does not exist." 
@@ -166,13 +203,11 @@ vpxenc_vp8_ivf_piped_input() { vpxenc_vp9_ivf() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf" - vpxenc --codec=vp9 \ - --width="${YUV_RAW_INPUT_WIDTH}" \ - --height="${YUV_RAW_INPUT_HEIGHT}" \ + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp9 \ --limit="${TEST_FRAMES}" \ --ivf \ - --output="${output}" \ - "${YUV_RAW_INPUT}" + --output="${output}" if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -185,12 +220,25 @@ vpxenc_vp9_webm() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" - vpxenc --codec=vp9 \ - --width="${YUV_RAW_INPUT_WIDTH}" \ - --height="${YUV_RAW_INPUT_HEIGHT}" \ + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp9 \ --limit="${TEST_FRAMES}" \ - --output="${output}" \ - "${YUV_RAW_INPUT}" + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp9_webm_rt() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm" + vpxenc $(yuv_input_hantro_collage) \ + $(vpxenc_rt_params vp9) \ + --output="${output}" if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -203,14 +251,11 @@ vpxenc_vp9_webm_2pass() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" - vpxenc --codec=vp9 \ - --width="${YUV_RAW_INPUT_WIDTH}" \ - --height="${YUV_RAW_INPUT_HEIGHT}" \ + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp9 \ --limit="${TEST_FRAMES}" \ - --test-decode=fatal \ --output="${output}" \ - --passes=2 \ - "${YUV_RAW_INPUT}" + --passes=2 if [ ! -e "${output}" ]; then elog "Output file does not exist." 
@@ -222,14 +267,12 @@ vpxenc_vp9_webm_2pass() { vpxenc_vp9_ivf_lossless() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf" - vpxenc --codec=vp9 \ - --width="${YUV_RAW_INPUT_WIDTH}" \ - --height="${YUV_RAW_INPUT_HEIGHT}" \ + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp9 \ --limit="${TEST_FRAMES}" \ --ivf \ --output="${output}" \ - --lossless=1 \ - "${YUV_RAW_INPUT}" + --lossless=1 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -241,15 +284,13 @@ vpxenc_vp9_ivf_lossless() { vpxenc_vp9_ivf_minq0_maxq0() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf" - vpxenc --codec=vp9 \ - --width="${YUV_RAW_INPUT_WIDTH}" \ - --height="${YUV_RAW_INPUT_HEIGHT}" \ + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp9 \ --limit="${TEST_FRAMES}" \ --ivf \ --output="${output}" \ --min-q=0 \ - --max-q=0 \ - "${YUV_RAW_INPUT}" + --max-q=0 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -264,16 +305,13 @@ vpxenc_vp9_webm_lag10_frames20() { local readonly lag_total_frames=20 local readonly lag_frames=10 local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm" - vpxenc --codec=vp9 \ - --width="${YUV_RAW_INPUT_WIDTH}" \ - --height="${YUV_RAW_INPUT_HEIGHT}" \ + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp9 \ --limit="${lag_total_frames}" \ --lag-in-frames="${lag_frames}" \ --output="${output}" \ - --test-decode=fatal \ --passes=2 \ - --auto-alt-ref=1 \ - "${YUV_RAW_INPUT}" + --auto-alt-ref=1 if [ ! -e "${output}" ]; then elog "Output file does not exist." 
@@ -284,11 +322,13 @@ vpxenc_vp9_webm_lag10_frames20() { vpxenc_tests="vpxenc_vp8_ivf vpxenc_vp8_webm + vpxenc_vp8_webm_rt vpxenc_vp8_webm_2pass vpxenc_vp8_webm_lag10_frames20 vpxenc_vp8_ivf_piped_input vpxenc_vp9_ivf vpxenc_vp9_webm + vpxenc_vp9_webm_rt vpxenc_vp9_webm_2pass vpxenc_vp9_ivf_lossless vpxenc_vp9_ivf_minq0_maxq0 diff --git a/source/libvpx/test/y4m_test.cc b/source/libvpx/test/y4m_test.cc index 17cd782..58a6fe3 100644 --- a/source/libvpx/test/y4m_test.cc +++ b/source/libvpx/test/y4m_test.cc @@ -57,7 +57,7 @@ static void write_image_file(const vpx_image_t *img, FILE *file) { for (plane = 0; plane < 3; ++plane) { const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; - const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1; + const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; const int h = (plane ? (img->d_h + img->y_chroma_shift) >> img->y_chroma_shift : img->d_h); const int w = (plane ? (img->d_w + img->x_chroma_shift) >> diff --git a/source/libvpx/third_party/libyuv/README.libvpx b/source/libvpx/third_party/libyuv/README.libvpx index fa5b498..3869d25 100644 --- a/source/libvpx/third_party/libyuv/README.libvpx +++ b/source/libvpx/third_party/libyuv/README.libvpx @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1041 +Version: 1060 License: BSD License File: LICENSE @@ -13,4 +13,4 @@ which down-samples the original input video (f.g. 1280x720) a number of times in order to encode multiple resolution bit streams. Local Modifications: -None. 
+cherry-pick 'Issue 24479004: Fix building with MSVC for arm' diff --git a/source/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/source/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h index 82fd95d..8423121 100644 --- a/source/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h +++ b/source/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h @@ -153,7 +153,6 @@ class LIBYUV_API MJpegDecoder { int* subsample_x, int* subsample_y, int number_of_components); private: - void AllocOutputBuffers(int num_outbufs); void DestroyOutputBuffers(); diff --git a/source/libvpx/third_party/libyuv/include/libyuv/row.h b/source/libvpx/third_party/libyuv/include/libyuv/row.h index fdfe1ae..4b3c870 100644 --- a/source/libvpx/third_party/libyuv/include/libyuv/row.h +++ b/source/libvpx/third_party/libyuv/include/libyuv/row.h @@ -252,6 +252,94 @@ extern "C" { // The following are available on arm64 platforms: #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +// #define HAS_I444TOARGBROW_NEON +// #define HAS_I422TOARGBROW_NEON +// #define HAS_I411TOARGBROW_NEON +// #define HAS_I422TOBGRAROW_NEON +// #define HAS_I422TOABGRROW_NEON +// #define HAS_I422TORGBAROW_NEON +// #define HAS_I422TORGB24ROW_NEON +// #define HAS_I422TORAWROW_NEON +// #define HAS_I422TORGB565ROW_NEON +// #define HAS_I422TOARGB1555ROW_NEON +// #define HAS_I422TOARGB4444ROW_NEON +// #define HAS_YTOARGBROW_NEON +// #define HAS_I400TOARGBROW_NEON +// #define HAS_NV12TOARGBROW_NEON +// #define HAS_NV21TOARGBROW_NEON +// #define HAS_NV12TORGB565ROW_NEON +// #define HAS_NV21TORGB565ROW_NEON +// #define HAS_YUY2TOARGBROW_NEON +// #define HAS_UYVYTOARGBROW_NEON +#define HAS_SPLITUVROW_NEON +#define HAS_MERGEUVROW_NEON +#define HAS_COPYROW_NEON +#define HAS_SETROW_NEON +#define HAS_ARGBSETROWS_NEON +#define HAS_MIRRORROW_NEON +#define HAS_MIRRORUVROW_NEON +#define HAS_ARGBMIRRORROW_NEON +#define HAS_RGB24TOARGBROW_NEON +#define HAS_RAWTOARGBROW_NEON +// #define HAS_RGB565TOARGBROW_NEON +// 
#define HAS_ARGB1555TOARGBROW_NEON +// #define HAS_ARGB4444TOARGBROW_NEON +#define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORAWROW_NEON +#define HAS_YUY2TOYROW_NEON +#define HAS_UYVYTOYROW_NEON +#define HAS_YUY2TOUV422ROW_NEON +#define HAS_UYVYTOUV422ROW_NEON +#define HAS_YUY2TOUVROW_NEON +#define HAS_UYVYTOUVROW_NEON +#define HAS_HALFROW_NEON +#define HAS_ARGBTOBAYERROW_NEON +#define HAS_ARGBTOBAYERGGROW_NEON +#define HAS_ARGBSHUFFLEROW_NEON +#define HAS_I422TOYUY2ROW_NEON +#define HAS_I422TOUYVYROW_NEON +// #define HAS_ARGBTORGB565ROW_NEON +// #define HAS_ARGBTOARGB1555ROW_NEON +// #define HAS_ARGBTOARGB4444ROW_NEON +#define HAS_ARGBTOYROW_NEON +#define HAS_ARGBTOYJROW_NEON +// #define HAS_ARGBTOUV444ROW_NEON +// #define HAS_ARGBTOUV422ROW_NEON +// #define HAS_ARGBTOUV411ROW_NEON +// #define HAS_ARGBTOUVROW_NEON +// #define HAS_ARGBTOUVJROW_NEON +// #define HAS_BGRATOUVROW_NEON +// #define HAS_ABGRTOUVROW_NEON +// #define HAS_RGBATOUVROW_NEON +// #define HAS_RGB24TOUVROW_NEON +// #define HAS_RAWTOUVROW_NEON +// #define HAS_RGB565TOUVROW_NEON +// #define HAS_ARGB1555TOUVROW_NEON +// #define HAS_ARGB4444TOUVROW_NEON +// #define HAS_RGB565TOYROW_NEON +// #define HAS_ARGB1555TOYROW_NEON +// #define HAS_ARGB4444TOYROW_NEON +// #define HAS_BGRATOYROW_NEON +// #define HAS_ABGRTOYROW_NEON +// #define HAS_RGBATOYROW_NEON +// #define HAS_RGB24TOYROW_NEON +// #define HAS_RAWTOYROW_NEON +// #define HAS_INTERPOLATEROW_NEON +// #define HAS_ARGBBLENDROW_NEON +// #define HAS_ARGBATTENUATEROW_NEON +// #define HAS_ARGBQUANTIZEROW_NEON +// #define HAS_ARGBSHADEROW_NEON +// #define HAS_ARGBGRAYROW_NEON +// #define HAS_ARGBSEPIAROW_NEON +// #define HAS_ARGBCOLORMATRIXROW_NEON +#define HAS_ARGBMULTIPLYROW_NEON +#define HAS_ARGBADDROW_NEON +#define HAS_ARGBSUBTRACTROW_NEON +#define HAS_SOBELROW_NEON +#define HAS_SOBELTOPLANEROW_NEON +#define HAS_SOBELXYROW_NEON +#define HAS_SOBELXROW_NEON +#define HAS_SOBELYROW_NEON #endif // The following are available on Neon platforms: @@ -465,7 
+553,7 @@ typedef uint8 uvec8[16]; #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" #endif // defined(__native_client__) && defined(__x86_64__) -#if defined(__arm__) +#if defined(__arm__) || defined(__aarch64__) #undef MEMACCESS #if defined(__native_client__) #define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" diff --git a/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h b/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h index 8dc0762..3c49542 100644 --- a/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h +++ b/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h @@ -51,6 +51,14 @@ extern "C" { #define HAS_SCALEROWDOWN38_NEON #define HAS_SCALEARGBROWDOWNEVEN_NEON #define HAS_SCALEARGBROWDOWN2_NEON +#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__aarch64__) || defined(LIBYUV_NEON)) +/* #define HAS_SCALEROWDOWN2_NEON */ +/* #define HAS_SCALEROWDOWN4_NEON */ +/* #define HAS_SCALEROWDOWN34_NEON */ +/* #define HAS_SCALEROWDOWN38_NEON */ +/* #define HAS_SCALEARGBROWDOWNEVEN_NEON */ +/* #define HAS_SCALEARGBROWDOWN2_NEON */ #endif // The following are available on Mips platforms: diff --git a/source/libvpx/third_party/libyuv/include/libyuv/version.h b/source/libvpx/third_party/libyuv/include/libyuv/version.h index 912c4c9..73a7f1b 100644 --- a/source/libvpx/third_party/libyuv/include/libyuv/version.h +++ b/source/libvpx/third_party/libyuv/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1041 +#define LIBYUV_VERSION 1059 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/libvpx/third_party/libyuv/source/compare.cc b/source/libvpx/third_party/libyuv/source/compare.cc index 9ea81b4..dc715e0 100644 --- a/source/libvpx/third_party/libyuv/source/compare.cc +++ b/source/libvpx/third_party/libyuv/source/compare.cc @@ -80,7 +80,7 @@ uint32 HashDjb2(const uint8* 
src, uint64 count, uint32 seed) { uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); #if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_SUMSQUAREERROR_NEON uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); #endif diff --git a/source/libvpx/third_party/libyuv/source/compare_neon.cc b/source/libvpx/third_party/libyuv/source/compare_neon.cc index 5e7b8e4..55052c0 100644 --- a/source/libvpx/third_party/libyuv/source/compare_neon.cc +++ b/source/libvpx/third_party/libyuv/source/compare_neon.cc @@ -56,6 +56,45 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { return sse; } +#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + "subs %2, %2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "bgt 1b \n" + + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + return sse; +} + #endif // __ARM_NEON__ #ifdef __cplusplus diff --git a/source/libvpx/third_party/libyuv/source/convert.cc b/source/libvpx/third_party/libyuv/source/convert.cc index 874a6cb..a8e294f 100644 --- 
a/source/libvpx/third_party/libyuv/source/convert.cc +++ b/source/libvpx/third_party/libyuv/source/convert.cc @@ -401,7 +401,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, uint8* dst_v, int dst_stride_v, int width, int height) { int y; - int halfheight = (height + 1) >> 1; + int halfheight; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUV422Row_C; @@ -711,11 +711,13 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 16) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; } } #endif @@ -963,9 +965,6 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); #endif if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -1022,36 +1021,44 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, #endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_RGB24TOYROW_NEON - for (y = 0; y < height - 1; y += 2) { + { +#if !defined(HAS_RGB24TOYROW_NEON) + // Allocate 2 rows of ARGB. 
+ const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { #if defined(HAS_RGB24TOYROW_NEON) - RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); - RGB24ToYRow(src_rgb24, dst_y, width); - RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); + RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); + RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); #else - RGB24ToARGBRow(src_rgb24, row, width); - RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif - src_rgb24 += src_stride_rgb24 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { + src_rgb24 += src_stride_rgb24 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { #if defined(HAS_RGB24TOYROW_NEON) - RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); - RGB24ToYRow(src_rgb24, dst_y, width); + RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); #else - RGB24ToARGBRow(src_rgb24, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); #endif - } + } #if !defined(HAS_RGB24TOYROW_NEON) - free_aligned_buffer_64(row); + free_aligned_buffer_64(row); #endif + } return 0; } @@ -1075,9 +1082,6 @@ int RAWToI420(const uint8* src_raw, int 
src_stride_raw, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); #endif if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -1134,36 +1138,42 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, #endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_RAWTOYROW_NEON - for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_RAWTOYROW_NEON) - RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); - RAWToYRow(src_raw, dst_y, width); - RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); -#else - RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_raw += src_stride_raw * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if defined(HAS_RAWTOYROW_NEON) - RAWToUVRow(src_raw, 0, dst_u, dst_v, width); - RAWToYRow(src_raw, dst_y, width); -#else - RAWToARGBRow(src_raw, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); -#endif + { + // Allocate 2 rows of ARGB. 
+ const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + + for (y = 0; y < height - 1; y += 2) { + #if defined(HAS_RAWTOYROW_NEON) + RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); + RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); + #else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + #endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + #if defined(HAS_RAWTOYROW_NEON) + RAWToUVRow(src_raw, 0, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); + #else + RAWToARGBRow(src_raw, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + #endif + } + #if !defined(HAS_RAWTOYROW_NEON) + free_aligned_buffer_64(row); + #endif } -#if !defined(HAS_RAWTOYROW_NEON) - free_aligned_buffer_64(row); -#endif return 0; } @@ -1187,9 +1197,6 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); #endif if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -1246,36 +1253,44 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, #endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_RGB565TOYROW_NEON - for (y = 0; y < height - 1; y += 2) { + { +#if !defined(HAS_RGB565TOYROW_NEON) + // Allocate 2 rows of ARGB. 
+ const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { #if defined(HAS_RGB565TOYROW_NEON) - RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); - RGB565ToYRow(src_rgb565, dst_y, width); - RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); + RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); + RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); #else - RGB565ToARGBRow(src_rgb565, row, width); - RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + RGB565ToARGBRow(src_rgb565, row, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif - src_rgb565 += src_stride_rgb565 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { + src_rgb565 += src_stride_rgb565 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { #if defined(HAS_RGB565TOYROW_NEON) - RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); - RGB565ToYRow(src_rgb565, dst_y, width); + RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); #else - RGB565ToARGBRow(src_rgb565, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); + RGB565ToARGBRow(src_rgb565, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); #endif - } + } #if !defined(HAS_RGB565TOYROW_NEON) - free_aligned_buffer_64(row); + free_aligned_buffer_64(row); #endif + } return 0; } @@ -1299,9 +1314,6 @@ int 
ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); #endif if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -1358,38 +1370,45 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, #endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_ARGB1555TOYROW_NEON - for (y = 0; y < height - 1; y += 2) { + { +#if !defined(HAS_ARGB1555TOYROW_NEON) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + for (y = 0; y < height - 1; y += 2) { #if defined(HAS_ARGB1555TOYROW_NEON) - ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); - ARGB1555ToYRow(src_argb1555, dst_y, width); - ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, - width); + ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); + ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, + width); #else - ARGB1555ToARGBRow(src_argb1555, row, width); - ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, - width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif - src_argb1555 += src_stride_argb1555 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { + src_argb1555 += 
src_stride_argb1555 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { #if defined(HAS_ARGB1555TOYROW_NEON) - ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); - ARGB1555ToYRow(src_argb1555, dst_y, width); + ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); #else - ARGB1555ToARGBRow(src_argb1555, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); #endif - } + } #if !defined(HAS_ARGB1555TOYROW_NEON) free_aligned_buffer_64(row); #endif + } return 0; } @@ -1413,9 +1432,6 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); #endif if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -1472,38 +1488,46 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, #endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_ARGB4444TOYROW_NEON - for (y = 0; y < height - 1; y += 2) { + { +#if !defined(HAS_ARGB4444TOYROW_NEON) + // Allocate 2 rows of ARGB. 
+ const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { #if defined(HAS_ARGB4444TOYROW_NEON) - ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); - ARGB4444ToYRow(src_argb4444, dst_y, width); - ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, - width); + ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); + ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, + width); #else - ARGB4444ToARGBRow(src_argb4444, row, width); - ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, - width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif - src_argb4444 += src_stride_argb4444 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { + src_argb4444 += src_stride_argb4444 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { #if defined(HAS_ARGB4444TOYROW_NEON) - ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); - ARGB4444ToYRow(src_argb4444, dst_y, width); + ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); #else - ARGB4444ToARGBRow(src_argb4444, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); #endif - } + } #if !defined(HAS_ARGB4444TOYROW_NEON) - 
free_aligned_buffer_64(row); + free_aligned_buffer_64(row); #endif + } return 0; } diff --git a/source/libvpx/third_party/libyuv/source/convert_from_argb.cc b/source/libvpx/third_party/libyuv/source/convert_from_argb.cc index 121a416..de461dd 100644 --- a/source/libvpx/third_party/libyuv/source/convert_from_argb.cc +++ b/source/libvpx/third_party/libyuv/source/convert_from_argb.cc @@ -60,6 +60,13 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, } } } +#elif defined(HAS_ARGBTOUV444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToUV444Row = ARGBToUV444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToUV444Row = ARGBToUV444Row_NEON; + } + } #endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { @@ -76,10 +83,8 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, #elif defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { ARGBToYRow = ARGBToYRow_Any_NEON; - ARGBToUV444Row = ARGBToUV444Row_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; - ARGBToUV444Row = ARGBToUV444Row_NEON; } } #endif @@ -134,6 +139,13 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, } } } +#elif defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } #endif #if defined(HAS_ARGBTOYROW_SSSE3) @@ -153,12 +165,6 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 16) { - ARGBToUV422Row = ARGBToUV422Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUV422Row = ARGBToUV422Row_NEON; - } - } } #endif @@ -228,11 +234,13 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 32) { - ARGBToUV411Row = ARGBToUV411Row_Any_NEON; - if (IS_ALIGNED(width, 32)) { - ARGBToUV411Row 
= ARGBToUV411Row_NEON; - } + } +#endif +#if defined(HAS_ARGBTOUV411ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 32) { + ARGBToUV411Row = ARGBToUV411Row_Any_NEON; + if (IS_ALIGNED(width, 32)) { + ARGBToUV411Row = ARGBToUV411Row_NEON; } } #endif @@ -261,9 +269,6 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, ARGBToYRow_C; void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) = MergeUVRow_C; - // Allocate a rows of uv. - align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); - uint8* row_v = row_u + ((halfwidth + 15) & ~15); if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { @@ -296,11 +301,13 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 16) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; } } #endif @@ -331,22 +338,27 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, } } #endif + { + // Allocate a rows of uv. 
+ align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); + uint8* row_v = row_u + ((halfwidth + 15) & ~15); - for (y = 0; y < height - 1; y += 2) { - ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); - MergeUVRow_(row_u, row_v, dst_uv, halfwidth); - ARGBToYRow(src_argb, dst_y, width); - ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); - src_argb += src_stride_argb * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - ARGBToUVRow(src_argb, 0, row_u, row_v, width); - MergeUVRow_(row_u, row_v, dst_uv, halfwidth); - ARGBToYRow(src_argb, dst_y, width); + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); } - free_aligned_buffer_64(row_u); return 0; } @@ -364,9 +376,6 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, ARGBToYRow_C; void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) = MergeUVRow_C; - // Allocate a rows of uv. 
- align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); - uint8* row_v = row_u + ((halfwidth + 15) & ~15); if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { @@ -399,11 +408,13 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 16) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; } } #endif @@ -434,22 +445,27 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, } } #endif + { + // Allocate a rows of uv. + align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); + uint8* row_v = row_u + ((halfwidth + 15) & ~15); - for (y = 0; y < height - 1; y += 2) { - ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_uv, halfwidth); - ARGBToYRow(src_argb, dst_y, width); - ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); - src_argb += src_stride_argb * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - ARGBToUVRow(src_argb, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_uv, halfwidth); - ARGBToYRow(src_argb, dst_y, width); + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); } - free_aligned_buffer_64(row_u); return 0; } @@ -493,6 
+509,13 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } } +#elif defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } #endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { @@ -510,12 +533,6 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 16) { - ARGBToUV422Row = ARGBToUV422Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUV422Row = ARGBToUV422Row_NEON; - } - } } #endif @@ -594,6 +611,13 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } } +#elif defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } #endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { @@ -611,12 +635,6 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 16) { - ARGBToUV422Row = ARGBToUV422Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUV422Row = ARGBToUV422Row_NEON; - } - } } #endif @@ -1022,11 +1040,13 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYJRow = ARGBToYJRow_NEON; } - if (width >= 16) { - ARGBToUVJRow = ARGBToUVJRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_NEON; - } + } +#endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; } } #endif diff --git a/source/libvpx/third_party/libyuv/source/cpu_id.cc b/source/libvpx/third_party/libyuv/source/cpu_id.cc index 2e0d61d..8f8a403 100644 --- 
a/source/libvpx/third_party/libyuv/source/cpu_id.cc +++ b/source/libvpx/third_party/libyuv/source/cpu_id.cc @@ -14,8 +14,9 @@ #include <intrin.h> // For __cpuidex() #endif #if !defined(__pnacl__) && !defined(__CLR_VER) && \ - !defined(__native_client__) && defined(_M_X64) && \ - defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) + !defined(__native_client__) && \ + defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \ + (defined(_M_IX86) || defined(_M_X64)) #include <immintrin.h> // For _xgetbv() #endif @@ -97,7 +98,7 @@ int TestOsSaveYmm() { uint32 xcr0 = 0u; #if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. -#elif defined(_M_IX86) +#elif defined(_M_IX86) && defined(_MSC_VER) __asm { xor ecx, ecx // xcr 0 _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. @@ -256,12 +257,17 @@ int InitCpuFlags(void) { if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) { cpu_info_ &= ~kCpuHasMIPS_DSPR2; } -#elif defined(__arm__) +#elif defined(__arm__) || defined(__aarch64__) // gcc -mfpu=neon defines __ARM_NEON__ // __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. // For Linux, /proc/cpuinfo can be tested but without that assume Neon. #if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) cpu_info_ = kCpuHasNEON; +// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon +// flag in it. +// So for aarch64, neon enabling is hard coded here. +#elif defined(__aarch64__) + cpu_info_ = kCpuHasNEON; #else // Linux arm parse text file for neon detect. 
cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); diff --git a/source/libvpx/third_party/libyuv/source/format_conversion.cc b/source/libvpx/third_party/libyuv/source/format_conversion.cc index a3daf96..3c17371 100644 --- a/source/libvpx/third_party/libyuv/source/format_conversion.cc +++ b/source/libvpx/third_party/libyuv/source/format_conversion.cc @@ -332,11 +332,13 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 16) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; } } #endif diff --git a/source/libvpx/third_party/libyuv/source/mjpeg_decoder.cc b/source/libvpx/third_party/libyuv/source/mjpeg_decoder.cc index 15b0ed8..36028c3 100644 --- a/source/libvpx/third_party/libyuv/source/mjpeg_decoder.cc +++ b/source/libvpx/third_party/libyuv/source/mjpeg_decoder.cc @@ -13,8 +13,8 @@ #ifdef HAVE_JPEG #include <assert.h> -#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\ - !defined(TARGET_IPHONE_SIMULATOR) +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) // Must be included before jpeglib. 
#include <setjmp.h> #define HAVE_SETJMP @@ -101,7 +101,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { } buf_.data = src; - buf_.len = (int)(src_len); + buf_.len = static_cast<int>(src_len); buf_vec_.pos = 0; decompress_struct_->client_data = &buf_vec_; #ifdef HAVE_SETJMP @@ -411,7 +411,7 @@ void init_source(j_decompress_ptr cinfo) { } boolean fill_input_buffer(j_decompress_ptr cinfo) { - BufferVector* buf_vec = (BufferVector*)(cinfo->client_data); + BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data); if (buf_vec->pos >= buf_vec->len) { assert(0 && "No more data"); // ERROR: No more data @@ -447,7 +447,7 @@ void ErrorHandler(j_common_ptr cinfo) { // ERROR: Error in jpeglib: buf #endif - SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err); + SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err); // This rewinds the call stack to the point of the corresponding setjmp() // and causes it to return (for a second time) with value 1. 
longjmp(mgr->setjmp_buffer, 1); diff --git a/source/libvpx/third_party/libyuv/source/row_any.cc b/source/libvpx/third_party/libyuv/source/row_any.cc index 97ef844..ce8b3da 100644 --- a/source/libvpx/third_party/libyuv/source/row_any.cc +++ b/source/libvpx/third_party/libyuv/source/row_any.cc @@ -79,9 +79,13 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C, YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C, 1, 2, 7) YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7) +#endif // HAS_I422TOARGBROW_NEON +#ifdef HAS_I422TOYUY2ROW_NEON YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15) +#endif // HAS_I422TOYUY2ROW_NEON +#ifdef HAS_I422TOUYVYROW_NEON YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15) -#endif // HAS_I422TOARGBROW_NEON +#endif // HAS_I422TOUYVYROW_NEON #undef YANY // Wrappers to handle odd width @@ -250,12 +254,26 @@ YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8) YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8) YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8) YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8) +#endif +#ifdef HAS_YUY2TOYROW_NEON YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16) +#endif +#ifdef HAS_UYVYTOYROW_NEON YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16) +#endif +#ifdef HAS_RGB24TOARGBROW_NEON YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8) +#endif +#ifdef HAS_RAWTOARGBROW_NEON YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8) +#endif +#ifdef HAS_RGB565TOARGBROW_NEON YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8) +#endif +#ifdef HAS_ARGB1555TOARGBROW_NEON YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8) +#endif +#ifdef HAS_ARGB4444TOARGBROW_NEON YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8) #endif #undef YANY @@ -333,7 +351,11 @@ UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15) 
UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15) UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15) UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15) +#endif +#ifdef HAS_YUY2TOUVROW_NEON UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15) +#endif +#ifdef HAS_UYVYTOUVROW_NEON UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15) #endif #undef UVANY diff --git a/source/libvpx/third_party/libyuv/source/row_neon64.cc b/source/libvpx/third_party/libyuv/source/row_neon64.cc index 46e9ceb..21111cf 100644 --- a/source/libvpx/third_party/libyuv/source/row_neon64.cc +++ b/source/libvpx/third_party/libyuv/source/row_neon64.cc @@ -824,19 +824,19 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store U + "st1 {v0.16b}, [%1], #16 \n" // store U MEMACCESS(2) - "vst1.8 {q1}, [%2]! \n" // store V + "st1 {v1.16b}, [%2], #16 \n" // store V "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(width) // %3 // Output registers : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1" // Clobber List ); } #endif // HAS_SPLITUVROW_NEON @@ -849,12 +849,12 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load U + "ld1 {v0.16b}, [%0], #16 \n" // load U MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load V + "ld1 {v1.16b}, [%1], #16 \n" // load V "subs %3, %3, #16 \n" // 16 processed per loop MEMACCESS(2) - "vst2.u8 {q0, q1}, [%2]! 
\n" // store 16 pairs of UV + "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV "bgt 1b \n" : "+r"(src_u), // %0 @@ -862,7 +862,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, "+r"(dst_uv), // %2 "+r"(width) // %3 // Output registers : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1" // Clobber List ); } #endif // HAS_MERGEUVROW_NEON @@ -874,16 +874,16 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop MEMACCESS(1) - "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32 "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(count) // %2 // Output registers : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } #endif // HAS_COPYROW_NEON @@ -892,16 +892,16 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { #ifdef HAS_SETROW_NEON void SetRow_NEON(uint8* dst, uint32 v32, int count) { asm volatile ( - "vdup.u32 q0, %2 \n" // duplicate 4 ints + "dup v0.4s, %w2 \n" // duplicate 4 ints "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" // store + "st1 {v0.16b}, [%0], #16 \n" // store "bgt 1b \n" : "+r"(dst), // %0 "+r"(count) // %1 : "r"(v32) // %2 - : "cc", "memory", "q0" + : "cc", "memory", "v0" ); } #endif // HAS_SETROW_NEON @@ -922,26 +922,25 @@ void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { asm volatile ( // Start at end of source row. - "mov r3, #-16 \n" "add %0, %0, %2 \n" - "sub %0, #16 \n" + "sub %0, %0, #16 \n" ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #16 \n" // 16 pixels per loop. 
- "vrev64.8 q0, q0 \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %2, %2, #16 \n" // 16 pixels per loop. + "rev64 v0.16b, v0.16b \n" MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // dst += 16 + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" + "st1 {v0.D}[0], [%1], #8 \n" "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0" + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0" ); } #endif // HAS_MIRRORROW_NEON @@ -951,27 +950,27 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( // Start at end of source row. - "mov r12, #-16 \n" "add %0, %0, %3, lsl #1 \n" - "sub %0, #16 \n" + "sub %0, %0, #16 \n" ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %3, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" + "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 + "subs %3, %3, #8 \n" // 8 pixels per loop. + "rev64 v0.8b, v0.8b \n" + "rev64 v1.8b, v1.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // dst += 8 + "st1 {v0.8b}, [%1], #8 \n" // dst += 8 MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" + "st1 {v1.8b}, [%2], #8 \n" "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(width) // %3 - : - : "cc", "memory", "r12", "q0" + : "r"((ptrdiff_t)-16) // %4 + : "cc", "memory", "v0", "v1" ); } #endif // HAS_MIRRORUVROW_NEON @@ -980,26 +979,25 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { asm volatile ( // Start at end of source row. - "mov r3, #-16 \n" "add %0, %0, %2, lsl #2 \n" - "sub %0, #16 \n" + "sub %0, %0, #16 \n" ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #4 \n" // 4 pixels per loop. - "vrev64.32 q0, q0 \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %2, %2, #4 \n" // 4 pixels per loop. 
+ "rev64 v0.4s, v0.4s \n" MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // dst += 16 + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" + "st1 {v0.D}[0], [%1], #8 \n" "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0" + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0" ); } #endif // HAS_ARGBMIRRORROW_NEON @@ -1007,20 +1005,20 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { #ifdef HAS_RGB24TOARGBROW_NEON void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { asm volatile ( - "vmov.u8 d4, #255 \n" // Alpha + "movi v4.8b, #255 \n" // Alpha ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. MEMACCESS(1) - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List ); } #endif // HAS_RGB24TOARGBROW_NEON @@ -1028,21 +1026,22 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { #ifdef HAS_RAWTOARGBROW_NEON void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { asm volatile ( - "vmov.u8 d4, #255 \n" // Alpha + "movi v5.8b, #255 \n" // Alpha ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B + "mov v3.8b, v1.8b \n" // move g + "mov v4.8b, v0.8b \n" // move r MEMACCESS(1) - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. 
+ "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List ); } #endif // HAS_RAWTOARGBROW_NEON @@ -1170,16 +1169,16 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. + "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 "+r"(pix) // %2 : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List ); } #endif // HAS_ARGBTORGB24ROW_NEON @@ -1190,17 +1189,18 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B + "mov v4.8b, v2.8b \n" // mov g + "mov v5.8b, v1.8b \n" // mov b MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_raw), // %1 "+r"(pix) // %2 : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List ); } #endif // HAS_ARGBTORAWROW_NEON @@ -1211,16 +1211,16 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. 
"subs %2, %2, #16 \n" // 16 processed per loop. MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1" // Clobber List ); } #endif // HAS_YUY2TOYROW_NEON @@ -1231,16 +1231,16 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1" // Clobber List ); } #endif // HAS_UYVYTOYROW_NEON @@ -1252,19 +1252,19 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // store 8 U. + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. MEMACCESS(2) - "vst1.8 {d3}, [%2]! \n" // store 8 V. + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(pix) // %3 : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } #endif // HAS_YUY2TOUV422ROW_NEON @@ -1276,19 +1276,19 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. 
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 U. + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. MEMACCESS(2) - "vst1.8 {d2}, [%2]! \n" // store 8 V. + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(pix) // %3 : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } #endif // HAS_UYVYTOUV422ROW_NEON @@ -1297,20 +1297,20 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( - "add %1, %0, %1 \n" // stride + src_yuy2 + "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2 ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. - "vrhadd.u8 d1, d1, d5 \n" // average rows of U - "vrhadd.u8 d3, d3, d7 \n" // average rows of V + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2. + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 U. + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. MEMACCESS(3) - "vst1.8 {d3}, [%3]! \n" // store 8 V. + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 
"bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(stride_yuy2), // %1 @@ -1318,7 +1318,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List ); } #endif // HAS_YUY2TOUVROW_NEON @@ -1327,20 +1327,20 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( - "add %1, %0, %1 \n" // stride + src_uyvy + "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. - "vrhadd.u8 d0, d0, d4 \n" // average rows of U - "vrhadd.u8 d2, d2, d6 \n" // average rows of V + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY. + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 U. + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. MEMACCESS(3) - "vst1.8 {d2}, [%3]! \n" // store 8 V. + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. 
"bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(stride_uyvy), // %1 @@ -1348,7 +1348,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List ); } #endif // HAS_UYVYTOUVROW_NEON @@ -1358,23 +1358,23 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, uint8* dst_uv, int pix) { asm volatile ( // change the stride to row 2 pointer - "add %1, %0 \n" + "add %x1, %x0, %w1, sxtw \n" "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels. "subs %3, %3, #16 \n" // 16 processed per loop MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. - "vrhadd.u8 q0, q1 \n" // average row 1 and 2 + "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels. + "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2 MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" + "st1 {v0.16b}, [%2], #16 \n" "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(src_uv_stride), // %1 "+r"(dst_uv), // %2 "+r"(pix) // %3 : - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1" // Clobber List ); } #endif // HAS_HALFROW_NEON @@ -1384,22 +1384,22 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { asm volatile ( - "vmov.u32 d6[0], %3 \n" // selector + "mov v2.s[0], %w3 \n" // selector "1: \n" MEMACCESS(0) - "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. + "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels. 
"subs %2, %2, #8 \n" // 8 processed per loop - "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels - "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels - "vtrn.u32 d4, d5 \n" // combine 8 pixels + "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels + "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels + "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels MEMACCESS(1) - "vst1.8 {d4}, [%1]! \n" // store 8. + "st1 {v4.8b}, [%1], #8 \n" // store 8. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 "+r"(pix) // %2 : "r"(selector) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List ); } #endif // HAS_ARGBTOBAYERROW_NEON @@ -1411,16 +1411,16 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, asm volatile ( "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // store 8 G's. + "st1 {v1.8b}, [%1], #8 \n" // store 8 G's. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } #endif // HAS_ARGBTOBAYERGGROW_NEON @@ -1431,21 +1431,20 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( MEMACCESS(3) - "vld1.8 {q2}, [%3] \n" // shuffler + "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. "subs %2, %2, #4 \n" // 4 processed per loop - "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels - "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store 4. + "st1 {v1.16b}, [%1], #16 \n" // store 4. 
"bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 : "r"(shuffler) // %3 - : "cc", "memory", "q0", "q1", "q2" // Clobber List + : "cc", "memory", "v0", "v1", "v2" // Clobber List ); } #endif // HAS_ARGBSHUFFLEROW_NEON @@ -1459,14 +1458,15 @@ void I422ToYUY2Row_NEON(const uint8* src_y, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "mov v2.8b, v1.8b \n" MEMACCESS(1) - "vld1.8 {d1}, [%1]! \n" // load 8 Us + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us MEMACCESS(2) - "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels MEMACCESS(3) - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. + "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels. "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 @@ -1474,7 +1474,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, "+r"(dst_yuy2), // %3 "+r"(width) // %4 : - : "cc", "memory", "d0", "d1", "d2", "d3" + : "cc", "memory", "v0", "v1", "v2", "v3" ); } #endif // HAS_I422TOYUY2ROW_NEON @@ -1488,14 +1488,15 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys + "mov v3.8b, v2.8b \n" MEMACCESS(1) - "vld1.8 {d0}, [%1]! \n" // load 8 Us + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us MEMACCESS(2) - "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels MEMACCESS(3) - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels. 
"bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 @@ -1503,7 +1504,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, "+r"(dst_uyvy), // %3 "+r"(width) // %4 : - : "cc", "memory", "d0", "d1", "d2", "d3" + : "cc", "memory", "v0", "v1", "v2", "v3" ); } #endif // HAS_I422TOUYVYROW_NEON @@ -1577,28 +1578,28 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, #ifdef HAS_ARGBTOYROW_NEON void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
"bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } #endif // HAS_ARGBTOYROW_NEON @@ -1606,26 +1607,26 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { #ifdef HAS_ARGBTOYJROW_NEON void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient - "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient - "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + "movi v4.8b, #15 \n" // B * 0.11400 coefficient + "movi v5.8b, #75 \n" // G * 0.58700 coefficient + "movi v6.8b, #38 \n" // R * 0.29900 coefficient ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" ); } #endif // HAS_ARGBTOYJROW_NEON @@ -3048,20 +3049,20 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. MEMACCESS(1) - "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. 
"subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q0, d0, d1 \n" // multiply B - "vmull.u8 q1, d2, d3 \n" // multiply G - "vmull.u8 q2, d4, d5 \n" // multiply R - "vmull.u8 q3, d6, d7 \n" // multiply A - "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B - "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G - "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R - "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb0), // %0 @@ -3069,7 +3070,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1", "q2", "q3" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } #endif // HAS_ARGBMULTIPLYROW_NEON @@ -3083,14 +3084,16 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 q0, q0, q2 \n" // add B, G - "vqadd.u8 q1, q1, q3 \n" // add R, A + "uqadd v0.8b, v0.8b, v4.8b \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb0), // %0 @@ -3098,7 +3101,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1", "q2", "q3" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } #endif // HAS_ARGBADDROW_NEON @@ -3112,14 +3115,16 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. - "vqsub.u8 q0, q0, q2 \n" // subtract B, G - "vqsub.u8 q1, q1, q3 \n" // subtract R, A + "uqsub v0.8b, v0.8b, v4.8b \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb0), // %0 @@ -3127,7 +3132,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1", "q2", "q3" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } #endif // HAS_ARGBSUBTRACTROW_NEON @@ -3141,27 +3146,27 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { asm volatile ( - "vmov.u8 d3, #255 \n" // alpha + "movi v3.8b, #255 \n" // alpha // 8 pixel loop. ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. MEMACCESS(1) - "vld1.8 {d1}, [%1]! \n" // load 8 sobely. 
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d1 \n" // add - "vmov.u8 d1, d0 \n" - "vmov.u8 d2, d0 \n" + "uqadd v0.8b, v0.8b, v1.8b \n" // add + "mov v1.8b, v0.8b \n" + "mov v2.8b, v0.8b \n" MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1" + : "cc", "memory", "v0", "v1", "v2", "v3" ); } #endif // HAS_SOBELROW_NEON @@ -3175,20 +3180,20 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. "subs %3, %3, #16 \n" // 16 processed per loop. - "vqadd.u8 q0, q0, q1 \n" // add + "uqadd v0.16b, v0.16b, v1.16b \n" // add MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1" + : "cc", "memory", "v0", "v1" ); } #endif // HAS_SOBELTOPLANEROW_NEON @@ -3202,25 +3207,25 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { asm volatile ( - "vmov.u8 d3, #255 \n" // alpha + "movi v3.8b, #255 \n" // alpha // 8 pixel loop. ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. MEMACCESS(1) - "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. 
- "vqadd.u8 d1, d0, d2 \n" // add + "uqadd v1.8b, v0.8b, v2.8b \n" // add MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1" + : "cc", "memory", "v0", "v1", "v2", "v3" ); } #endif // HAS_SOBELXYROW_NEON @@ -3236,28 +3241,28 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d0}, [%0],%5 \n" // top + "ld1 {v0.8b}, [%0],%5 \n" // top MEMACCESS(0) - "vld1.8 {d1}, [%0],%6 \n" - "vsubl.u8 q0, d0, d1 \n" + "ld1 {v1.8b}, [%0],%6 \n" + "usubl v0.8h, v0.8b, v1.8b \n" MEMACCESS(1) - "vld1.8 {d2}, [%1],%5 \n" // center * 2 + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 MEMACCESS(1) - "vld1.8 {d3}, [%1],%6 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" + "ld1 {v3.8b}, [%1],%6 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" MEMACCESS(2) - "vld1.8 {d2}, [%2],%5 \n" // bottom + "ld1 {v2.8b}, [%2],%5 \n" // bottom MEMACCESS(2) - "vld1.8 {d3}, [%2],%6 \n" + "ld1 {v3.8b}, [%2],%6 \n" "subs %4, %4, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" MEMACCESS(3) - "vst1.8 {d0}, [%3]! 
\n" // store 8 sobelx + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 @@ -3266,7 +3271,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, "+r"(width) // %4 : "r"(2), // %5 "r"(6) // %6 - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } #endif // HAS_SOBELXROW_NEON @@ -3282,28 +3287,28 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d0}, [%0],%4 \n" // left + "ld1 {v0.8b}, [%0],%4 \n" // left MEMACCESS(1) - "vld1.8 {d1}, [%1],%4 \n" - "vsubl.u8 q0, d0, d1 \n" + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" MEMACCESS(0) - "vld1.8 {d2}, [%0],%4 \n" // center * 2 + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 MEMACCESS(1) - "vld1.8 {d3}, [%1],%4 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" MEMACCESS(0) - "vld1.8 {d2}, [%0],%5 \n" // right + "ld1 {v2.8b}, [%0],%5 \n" // right MEMACCESS(1) - "vld1.8 {d3}, [%1],%5 \n" + "ld1 {v3.8b}, [%1],%5 \n" "subs %3, %3, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" MEMACCESS(2) - "vst1.8 {d0}, [%2]! 
\n" // store 8 sobely + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 @@ -3311,7 +3316,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, "+r"(width) // %3 : "r"(1), // %4 "r"(6) // %5 - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } #endif // HAS_SOBELYROW_NEON diff --git a/source/libvpx/third_party/libyuv/source/row_win.cc b/source/libvpx/third_party/libyuv/source/row_win.cc index 8eb8889..d79c353 100644 --- a/source/libvpx/third_party/libyuv/source/row_win.cc +++ b/source/libvpx/third_party/libyuv/source/row_win.cc @@ -10,7 +10,7 @@ #include "libyuv/row.h" -#if defined (_M_X64) +#if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) #include <emmintrin.h> #include <tmmintrin.h> // For _mm_maddubs_epi16 #endif @@ -21,7 +21,8 @@ extern "C" { #endif // This module is for Visual C. -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + (defined(_M_IX86) || defined(_M_X64)) #define YG 74 /* (int8)(1.164 * 64 + 0.5) */ @@ -78,7 +79,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, const uint8* v_buf, uint8* dst_argb, int width) { - __m128i xmm0, xmm1, xmm2, xmm3; const __m128i xmm5 = _mm_set1_epi8(-1); const __m128i xmm4 = _mm_setzero_si128(); @@ -132,7 +132,6 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, const uint8* v_buf, uint8* dst_argb, int width) { - __m128i xmm0, xmm1, xmm2, xmm3; const __m128i xmm5 = _mm_set1_epi8(-1); const __m128i xmm4 = _mm_setzero_si128(); diff --git a/source/libvpx/third_party/libyuv/source/scale_neon64.cc b/source/libvpx/third_party/libyuv/source/scale_neon64.cc new file mode 100644 index 0000000..64c7d10 --- /dev/null +++ b/source/libvpx/third_party/libyuv/source/scale_neon64.cc @@ -0,0 +1,790 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#ifdef HAS_SCALEROWDOWN2_NEON +// Read 32x1 throw away even pixels, and write 16x1. +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + // load even pixels into q0, odd into q1 + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} +#endif //HAS_SCALEROWDOWN2_NEON + +#ifdef HAS_SCALEROWDOWN2_NEON +// Read 32x2 average down and write 16x1. +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + MEMACCESS(1) + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + MEMACCESS(2) + "vst1.8 {q0}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif //HAS_SCALEROWDOWN2_NEON + +#ifdef HAS_SCALEROWDOWN4_NEON +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN4_NEON + +#ifdef HAS_SCALEROWDOWN4_NEON +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride; + const uint8* src_ptr2 = src_ptr + src_stride * 2; + const uint8* src_ptr3 = src_ptr + src_stride * 3; +asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + MEMACCESS(3) + "vld1.8 {q1}, [%3]! \n" + MEMACCESS(4) + "vld1.8 {q2}, [%4]! \n" + MEMACCESS(5) + "vld1.8 {q3}, [%5]! \n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + MEMACCESS(1) + "vst1.32 {d0[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "q0", "q1", "q2", "q3", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN4_NEON + +#ifdef HAS_SCALEROWDOWN34_NEON +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. 
+void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + MEMACCESS(1) + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN34_NEON + +#ifdef HAS_SCALEROWDOWN34_NEON +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + MEMACCESS(1) + "vst3.8 {d0, d1, d2}, [%1]! 
\n" + + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" + ); +} +#endif //ScaleRowDown34_0_Box_NEON + +#ifdef HAS_SCALEROWDOWN34_NEON +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + MEMACCESS(1) + "vst3.8 {d0, d1, d2}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN34_NEON + +#ifdef HAS_SCALEROWDOWN38_NEON +#define HAS_SCALEROWDOWN38_NEON +static uvec8 kShuf38 = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +static uvec8 kShuf38_2 = + { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; +static vec16 kMult38_Div6 = + { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +static vec16 kMult38_Div9 = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {q3}, [%3] \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + MEMACCESS(1) + "vst1.8 {d4}, [%1]! \n" + MEMACCESS(1) + "vst1.32 {d5[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" + ); +} + +#endif //HAS_SCALEROWDOWN38_NEON + +#ifdef HAS_SCALEROWDOWN38_NEON +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride * 2; + + asm volatile ( + MEMACCESS(5) + "vld1.16 {q13}, [%5] \n" + MEMACCESS(6) + "vld1.8 {q14}, [%6] \n" + MEMACCESS(7) + "vld1.8 {q15}, [%7] \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! 
\n" + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + MEMACCESS(4) + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. 
+ "vqrdmulh.s16 q0, q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + MEMACCESS(1) + "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride), // %3 + "+r"(src_ptr1) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN38_NEON + +#ifdef HAS_SCALEROWDOWN38_NEON +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + MEMACCESS(4) + "vld1.16 {q13}, [%4] \n" + MEMACCESS(5) + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + MEMACCESS(1) + "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) + "vst1.32 {d4[0]}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN38_NEON + +#if 0 +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + asm volatile ( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! 
\n" + "bgt 100b \n" + + "99: \n" + MEMACCESS(0) + "vst1.8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" + ); +} +#endif //0 + +#ifdef HAS_SCALEARGBROWDOWN2_NEON +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + // load even pixels into q0, odd into q1 + MEMACCESS(0) + "vld2.32 {q0, q1}, [%0]! \n" + MEMACCESS(0) + "vld2.32 {q2, q3}, [%0]! \n" + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store odd pixels + MEMACCESS(1) + "vst1.8 {q3}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif //HAS_SCALEARGBROWDOWN2_NEON + +#ifdef HAS_SCALEARGBROWDOWN2_NEON +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. 
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} +#endif //HAS_SCALEARGBROWDOWN2_NEON + +#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %3, lsl #2 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.32 {d0[0]}, [%0], r12 \n" + MEMACCESS(0) + "vld1.32 {d0[1]}, [%0], r12 \n" + MEMACCESS(0) + "vld1.32 {d1[0]}, [%0], r12 \n" + MEMACCESS(0) + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0" + ); +} +#endif //HAS_SCALEARGBROWDOWNEVEN_NEON + +#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. 
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 + MEMACCESS(1) + "vld1.8 {d1}, [%1], r12 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0], r12 \n" + MEMACCESS(1) + "vld1.8 {d3}, [%1], r12 \n" + MEMACCESS(0) + "vld1.8 {d4}, [%0], r12 \n" + MEMACCESS(1) + "vld1.8 {d5}, [%1], r12 \n" + MEMACCESS(0) + "vld1.8 {d6}, [%0], r12 \n" + MEMACCESS(1) + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" + ); +} +#endif // HAS_SCALEARGBROWDOWNEVEN_NEON +#endif // __aarch64__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/libvpx/tools_common.c b/source/libvpx/tools_common.c index 7cfd066..2ec1711 100644 --- a/source/libvpx/tools_common.c +++ b/source/libvpx/tools_common.c @@ -83,7 +83,7 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) { struct FileTypeDetectionBuffer *detect = &input_ctx->detect; int plane = 0; int shortread = 0; - const int bytespp = (yuv_frame->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1; + const int bytespp = (yuv_frame->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1; for (plane = 0; plane < 3; ++plane) { uint8_t *ptr; @@ -241,7 +241,8 @@ int vpx_img_read(vpx_image_t *img, FILE *file) { for (plane = 0; plane < 3; ++plane) { unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; - const int w = vpx_img_plane_width(img, plane); + const int w = vpx_img_plane_width(img, plane) * + ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); const int h = vpx_img_plane_height(img, plane); int y; diff --git a/source/libvpx/tools_common.h b/source/libvpx/tools_common.h index 558413e..c1f466b 100644 --- a/source/libvpx/tools_common.h +++ b/source/libvpx/tools_common.h @@ -103,17 +103,25 @@ struct VpxInputContext { extern "C" { #endif +#if defined(__GNUC__) +#define VPX_NO_RETURN __attribute__((noreturn)) +#else +#define VPX_NO_RETURN +#endif + /* Sets a stdio stream into binary mode */ FILE *set_binary_mode(FILE *stream); -void die(const char *fmt, ...); -void fatal(const char *fmt, ...); +void die(const char *fmt, ...) VPX_NO_RETURN; +void fatal(const char *fmt, ...) 
VPX_NO_RETURN; void warn(const char *fmt, ...); -void die_codec(vpx_codec_ctx_t *ctx, const char *s); +void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN; /* The tool including this file must define usage_exit() */ -void usage_exit(); +void usage_exit() VPX_NO_RETURN; + +#undef VPX_NO_RETURN int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame); diff --git a/source/libvpx/vp8/common/arm/loopfilter_arm.c b/source/libvpx/vp8/common/arm/loopfilter_arm.c index f37ca63..5840c2b 100644 --- a/source/libvpx/vp8/common/arm/loopfilter_arm.c +++ b/source/libvpx/vp8/common/arm/loopfilter_arm.c @@ -25,22 +25,18 @@ extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6); extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6); #endif -#if HAVE_NEON_ASM || HAVE_NEON +#if HAVE_NEON typedef void loopfilter_y_neon(unsigned char *src, int pitch, unsigned char blimit, unsigned char limit, unsigned char thresh); typedef void loopfilter_uv_neon(unsigned char *u, int pitch, unsigned char blimit, unsigned char limit, unsigned char thresh, unsigned char *v); -#endif -#if HAVE_NEON_ASM extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon; extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon; extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon; extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon; -#endif -#if HAVE_NEON extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; @@ -150,9 +146,7 @@ void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign if (u_ptr) vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); } -#endif -#if HAVE_NEON_ASM /* Horizontal B Filtering */ void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int 
uv_stride, loop_filter_info *lfi) diff --git a/source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c index d77f2ba..9824a31 100644 --- a/source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c +++ b/source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c @@ -10,7 +10,7 @@ #include <arm_neon.h> -static const uint16_t bifilter4_coeff[8][2] = { +static const uint8_t bifilter4_coeff[8][2] = { {128, 0}, {112, 16}, { 96, 32}, @@ -64,8 +64,8 @@ void vp8_bilinear_predict4x4_neon( q1u8 = vcombine_u8(d2u8, d3u8); q2u8 = vcombine_u8(d4u8, d5u8); - d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]); - d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]); + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8); q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8); @@ -155,8 +155,8 @@ void vp8_bilinear_predict8x4_neon( q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; q5u8 = vld1q_u8(src_ptr); - d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]); - d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]); + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); @@ -245,8 +245,8 @@ void vp8_bilinear_predict8x8_neon( q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; - d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]); - d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]); + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); diff --git a/source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm deleted file mode 
100644 index a8730aa..0000000 --- a/source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm +++ /dev/null @@ -1,595 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_build_intra_predictors_mby_neon_func| - EXPORT |vp8_build_intra_predictors_mby_s_neon_func| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *y_buffer -; r1 unsigned char *ypred_ptr -; r2 int y_stride -; r3 int mode -; stack int Up -; stack int Left - -|vp8_build_intra_predictors_mby_neon_func| PROC - push {r4-r8, lr} - vpush {d8-d15} - - cmp r3, #0 - beq case_dc_pred - cmp r3, #1 - beq case_v_pred - cmp r3, #2 - beq case_h_pred - cmp r3, #3 - beq case_tm_pred - -case_dc_pred - ldr r4, [sp, #88] ; Up - ldr r5, [sp, #92] ; Left - - ; Default the DC average to 128 - mov r12, #128 - vdup.u8 q0, r12 - - ; Zero out running sum - mov r12, #0 - - ; compute shift and jump - adds r7, r4, r5 - beq skip_dc_pred_up_left - - ; Load above row, if it exists - cmp r4, #0 - beq skip_dc_pred_up - - sub r6, r0, r2 - vld1.8 {q1}, [r6] - vpaddl.u8 q2, q1 - vpaddl.u16 q3, q2 - vpaddl.u32 q4, q3 - - vmov.32 r4, d8[0] - vmov.32 r6, d9[0] - - add r12, r4, r6 - - ; Move back to interger registers - -skip_dc_pred_up - - cmp r5, #0 - beq skip_dc_pred_left - - sub r0, r0, #1 - - ; Load left row, if it exists - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - 
add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0] - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - -skip_dc_pred_left - add r7, r7, #3 ; Shift - sub r4, r7, #1 - mov r5, #1 - add r12, r12, r5, lsl r4 - mov r5, r12, lsr r7 ; expected_dc - - vdup.u8 q0, r5 - -skip_dc_pred_up_left - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - - vpop {d8-d15} - pop {r4-r8,pc} -case_v_pred - ; Copy down above row - sub r6, r0, r2 - vld1.8 {q0}, [r6] - - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vpop {d8-d15} - pop {r4-r8,pc} - -case_h_pred - ; Load 4x yleft_col - sub r0, r0, #1 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! 
- - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - vpop {d8-d15} - pop {r4-r8,pc} - -case_tm_pred - ; Load yabove_row - sub r3, r0, r2 - vld1.8 {q8}, [r3] - - ; Load ytop_left - sub r3, r3, #1 - ldrb r7, [r3] - - vdup.u16 q7, r7 - - ; Compute yabove_row - ytop_left - mov r3, #1 - vdup.u8 q0, r3 - - vmull.u8 q4, d16, d0 - vmull.u8 q5, d17, d0 - - vsub.s16 q4, q4, q7 - vsub.s16 q5, q5, q7 - - ; Load 4x yleft_col - sub r0, r0, #1 - mov r12, #4 - -case_tm_pred_loop - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u16 q0, r3 - vdup.u16 q1, r4 - vdup.u16 q2, r5 - vdup.u16 q3, r6 - - vqadd.s16 q8, q0, q4 - vqadd.s16 q9, q0, q5 - - vqadd.s16 q10, q1, q4 - vqadd.s16 q11, q1, q5 - - vqadd.s16 q12, q2, q4 - vqadd.s16 q13, q2, q5 - - vqadd.s16 q14, q3, q4 - vqadd.s16 q15, q3, q5 - - vqshrun.s16 d0, q8, #0 - vqshrun.s16 d1, q9, #0 - - vqshrun.s16 d2, q10, #0 - vqshrun.s16 d3, q11, #0 - - vqshrun.s16 d4, q12, #0 - vqshrun.s16 d5, q13, #0 - - vqshrun.s16 d6, q14, #0 - vqshrun.s16 d7, q15, #0 - - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! 
- - subs r12, r12, #1 - bne case_tm_pred_loop - - vpop {d8-d15} - pop {r4-r8,pc} - - ENDP - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; r0 unsigned char *y_buffer -; r1 unsigned char *ypred_ptr -; r2 int y_stride -; r3 int mode -; stack int Up -; stack int Left - -|vp8_build_intra_predictors_mby_s_neon_func| PROC - push {r4-r8, lr} - vpush {d8-d15} - - mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor; - - cmp r3, #0 - beq case_dc_pred_s - cmp r3, #1 - beq case_v_pred_s - cmp r3, #2 - beq case_h_pred_s - cmp r3, #3 - beq case_tm_pred_s - -case_dc_pred_s - ldr r4, [sp, #88] ; Up - ldr r5, [sp, #92] ; Left - - ; Default the DC average to 128 - mov r12, #128 - vdup.u8 q0, r12 - - ; Zero out running sum - mov r12, #0 - - ; compute shift and jump - adds r7, r4, r5 - beq skip_dc_pred_up_left_s - - ; Load above row, if it exists - cmp r4, #0 - beq skip_dc_pred_up_s - - sub r6, r0, r2 - vld1.8 {q1}, [r6] - vpaddl.u8 q2, q1 - vpaddl.u16 q3, q2 - vpaddl.u32 q4, q3 - - vmov.32 r4, d8[0] - vmov.32 r6, d9[0] - - add r12, r4, r6 - - ; Move back to interger registers - -skip_dc_pred_up_s - - cmp r5, #0 - beq skip_dc_pred_left_s - - sub r0, r0, #1 - - ; Load left row, if it exists - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0] - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - -skip_dc_pred_left_s - add r7, r7, #3 ; Shift - sub r4, r7, #1 - mov r5, #1 - add r12, r12, r5, lsl r4 - mov r5, r12, lsr r7 ; expected_dc - - vdup.u8 q0, r5 - 
-skip_dc_pred_up_left_s - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - - vpop {d8-d15} - pop {r4-r8,pc} -case_v_pred_s - ; Copy down above row - sub r6, r0, r2 - vld1.8 {q0}, [r6] - - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - - vpop {d8-d15} - pop {r4-r8,pc} - -case_h_pred_s - ; Load 4x yleft_col - sub r0, r0, #1 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, 
[r1], r2 - vst1.u8 {q3}, [r1], r2 - - vpop {d8-d15} - pop {r4-r8,pc} - -case_tm_pred_s - ; Load yabove_row - sub r3, r0, r2 - vld1.8 {q8}, [r3] - - ; Load ytop_left - sub r3, r3, #1 - ldrb r7, [r3] - - vdup.u16 q7, r7 - - ; Compute yabove_row - ytop_left - mov r3, #1 - vdup.u8 q0, r3 - - vmull.u8 q4, d16, d0 - vmull.u8 q5, d17, d0 - - vsub.s16 q4, q4, q7 - vsub.s16 q5, q5, q7 - - ; Load 4x yleft_col - sub r0, r0, #1 - mov r12, #4 - -case_tm_pred_loop_s - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u16 q0, r3 - vdup.u16 q1, r4 - vdup.u16 q2, r5 - vdup.u16 q3, r6 - - vqadd.s16 q8, q0, q4 - vqadd.s16 q9, q0, q5 - - vqadd.s16 q10, q1, q4 - vqadd.s16 q11, q1, q5 - - vqadd.s16 q12, q2, q4 - vqadd.s16 q13, q2, q5 - - vqadd.s16 q14, q3, q4 - vqadd.s16 q15, q3, q5 - - vqshrun.s16 d0, q8, #0 - vqshrun.s16 d1, q9, #0 - - vqshrun.s16 d2, q10, #0 - vqshrun.s16 d3, q11, #0 - - vqshrun.s16 d4, q12, #0 - vqshrun.s16 d5, q13, #0 - - vqshrun.s16 d6, q14, #0 - vqshrun.s16 d7, q15, #0 - - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - subs r12, r12, #1 - bne case_tm_pred_loop_s - - vpop {d8-d15} - pop {r4-r8,pc} - - ENDP - - - END diff --git a/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm deleted file mode 100644 index 3a39210..0000000 --- a/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm +++ /dev/null @@ -1,81 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. 
-; - - - EXPORT |idct_dequant_0_2x_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void idct_dequant_0_2x_neon(short *q, short dq, -; unsigned char *dst, int stride); -; r0 *q -; r1 dq -; r2 *dst -; r3 stride -|idct_dequant_0_2x_neon| PROC - push {r4, r5} - vpush {d8-d15} - - add r12, r2, #4 - vld1.32 {d2[0]}, [r2], r3 - vld1.32 {d8[0]}, [r12], r3 - vld1.32 {d2[1]}, [r2], r3 - vld1.32 {d8[1]}, [r12], r3 - vld1.32 {d4[0]}, [r2], r3 - vld1.32 {d10[0]}, [r12], r3 - vld1.32 {d4[1]}, [r2], r3 - vld1.32 {d10[1]}, [r12], r3 - - ldrh r12, [r0] ; lo q - ldrh r4, [r0, #32] ; hi q - mov r5, #0 - strh r5, [r0] - strh r5, [r0, #32] - - sxth r12, r12 ; lo - mul r0, r12, r1 - add r0, r0, #4 - asr r0, r0, #3 - vdup.16 q0, r0 - sxth r4, r4 ; hi - mul r0, r4, r1 - add r0, r0, #4 - asr r0, r0, #3 - vdup.16 q3, r0 - - vaddw.u8 q1, q0, d2 ; lo - vaddw.u8 q2, q0, d4 - vaddw.u8 q4, q3, d8 ; hi - vaddw.u8 q5, q3, d10 - - sub r2, r2, r3, lsl #2 ; dst - 4*stride - add r0, r2, #4 - - vqmovun.s16 d2, q1 ; lo - vqmovun.s16 d4, q2 - vqmovun.s16 d8, q4 ; hi - vqmovun.s16 d10, q5 - - vst1.32 {d2[0]}, [r2], r3 ; lo - vst1.32 {d8[0]}, [r0], r3 ; hi - vst1.32 {d2[1]}, [r2], r3 - vst1.32 {d8[1]}, [r0], r3 - vst1.32 {d4[0]}, [r2], r3 - vst1.32 {d10[0]}, [r0], r3 - vst1.32 {d4[1]}, [r2] - vst1.32 {d10[1]}, [r0] - - vpop {d8-d15} - pop {r4, r5} - bx lr - - ENDP ; |idct_dequant_0_2x_neon| - END diff --git a/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c new file mode 100644 index 0000000..967c322 --- /dev/null +++ b/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +void idct_dequant_0_2x_neon( + int16_t *q, + int16_t dq, + unsigned char *dst, + int stride) { + unsigned char *dst0; + int i, a0, a1; + int16x8x2_t q2Add; + int32x2_t d2s32, d4s32; + uint8x8_t d2u8, d4u8; + uint16x8_t q1u16, q2u16; + + a0 = ((q[0] * dq) + 4) >> 3; + a1 = ((q[16] * dq) + 4) >> 3; + q[0] = q[16] = 0; + q2Add.val[0] = vdupq_n_s16((int16_t)a0); + q2Add.val[1] = vdupq_n_s16((int16_t)a1); + + for (i = 0; i < 2; i++, dst += 4) { + dst0 = dst; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); + dst0 += stride; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); + + q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d2s32)); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d4s32)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + + d2s32 = vreinterpret_s32_u8(d2u8); + d4s32 = vreinterpret_s32_u8(d4u8); + + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d2s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d2s32, 1); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 1); + } + return; +} diff --git a/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm deleted file mode 100644 index 8da0fa0..0000000 --- a/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm +++ /dev/null @@ -1,199 +0,0 @@ -; -; Copyright (c) 2010 The Webm project authors. All Rights Reserved. 
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |idct_dequant_full_2x_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void idct_dequant_full_2x_neon(short *q, short *dq, -; unsigned char *dst, int stride); -; r0 *q, -; r1 *dq, -; r2 *dst -; r3 stride -|idct_dequant_full_2x_neon| PROC - vpush {d8-d15} - - vld1.16 {q0, q1}, [r1] ; dq (same l/r) - vld1.16 {q2, q3}, [r0] ; l q - add r0, r0, #32 - vld1.16 {q4, q5}, [r0] ; r q - add r12, r2, #4 - - ; interleave the predictors - vld1.32 {d28[0]}, [r2], r3 ; l pre - vld1.32 {d28[1]}, [r12], r3 ; r pre - vld1.32 {d29[0]}, [r2], r3 - vld1.32 {d29[1]}, [r12], r3 - vld1.32 {d30[0]}, [r2], r3 - vld1.32 {d30[1]}, [r12], r3 - vld1.32 {d31[0]}, [r2], r3 - vld1.32 {d31[1]}, [r12] - - adr r1, cospi8sqrt2minus1 ; pointer to the first constant - - ; dequant: q[i] = q[i] * dq[i] - vmul.i16 q2, q2, q0 - vmul.i16 q3, q3, q1 - vmul.i16 q4, q4, q0 - vmul.i16 q5, q5, q1 - - vld1.16 {d0}, [r1] - - ; q2: l0r0 q3: l8r8 - ; q4: l4r4 q5: l12r12 - vswp d5, d8 - vswp d7, d10 - - ; _CONSTANTS_ * 4,12 >> 16 - ; q6: 4 * sinpi : c1/temp1 - ; q7: 12 * sinpi : d1/temp2 - ; q8: 4 * cospi - ; q9: 12 * cospi - vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2 - vqdmulh.s16 q7, q5, d0[2] - vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1 - vqdmulh.s16 q9, q5, d0[0] - - vqadd.s16 q10, q2, q3 ; a1 = 0 + 8 - vqsub.s16 q11, q2, q3 ; b1 = 0 - 8 - - ; vqdmulh only accepts signed values. this was a problem because - ; our constant had the high bit set, and was treated as a negative value. - ; vqdmulh also doubles the value before it shifts by 16. we need to - ; compensate for this. 
in the case of sinpi8sqrt2, the lowest bit is 0, - ; so we can shift the constant without losing precision. this avoids - ; shift again afterward, but also avoids the sign issue. win win! - ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we - ; pre-shift it - vshr.s16 q8, q8, #1 - vshr.s16 q9, q9, #1 - - ; q4: 4 + 4 * cospi : d1/temp1 - ; q5: 12 + 12 * cospi : c1/temp2 - vqadd.s16 q4, q4, q8 - vqadd.s16 q5, q5, q9 - - ; c1 = temp1 - temp2 - ; d1 = temp1 + temp2 - vqsub.s16 q2, q6, q5 - vqadd.s16 q3, q4, q7 - - ; [0]: a1+d1 - ; [1]: b1+c1 - ; [2]: b1-c1 - ; [3]: a1-d1 - vqadd.s16 q4, q10, q3 - vqadd.s16 q5, q11, q2 - vqsub.s16 q6, q11, q2 - vqsub.s16 q7, q10, q3 - - ; rotate - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - ; idct loop 2 - ; q4: l 0, 4, 8,12 r 0, 4, 8,12 - ; q5: l 1, 5, 9,13 r 1, 5, 9,13 - ; q6: l 2, 6,10,14 r 2, 6,10,14 - ; q7: l 3, 7,11,15 r 3, 7,11,15 - - ; q8: 1 * sinpi : c1/temp1 - ; q9: 3 * sinpi : d1/temp2 - ; q10: 1 * cospi - ; q11: 3 * cospi - vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2 - vqdmulh.s16 q9, q7, d0[2] - vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1 - vqdmulh.s16 q11, q7, d0[0] - - vqadd.s16 q2, q4, q6 ; a1 = 0 + 2 - vqsub.s16 q3, q4, q6 ; b1 = 0 - 2 - - ; see note on shifting above - vshr.s16 q10, q10, #1 - vshr.s16 q11, q11, #1 - - ; q10: 1 + 1 * cospi : d1/temp1 - ; q11: 3 + 3 * cospi : c1/temp2 - vqadd.s16 q10, q5, q10 - vqadd.s16 q11, q7, q11 - - ; q8: c1 = temp1 - temp2 - ; q9: d1 = temp1 + temp2 - vqsub.s16 q8, q8, q11 - vqadd.s16 q9, q10, q9 - - ; a1+d1 - ; b1+c1 - ; b1-c1 - ; a1-d1 - vqadd.s16 q4, q2, q9 - vqadd.s16 q5, q3, q8 - vqsub.s16 q6, q3, q8 - vqsub.s16 q7, q2, q9 - - ; +4 >> 3 (rounding) - vrshr.s16 q4, q4, #3 ; lo - vrshr.s16 q5, q5, #3 - vrshr.s16 q6, q6, #3 ; hi - vrshr.s16 q7, q7, #3 - - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - ; adding pre - ; input is still packed. 
pre was read interleaved - vaddw.u8 q4, q4, d28 - vaddw.u8 q5, q5, d29 - vaddw.u8 q6, q6, d30 - vaddw.u8 q7, q7, d31 - - vmov.i16 q14, #0 - vmov q15, q14 - vst1.16 {q14, q15}, [r0] ; write over high input - sub r0, r0, #32 - vst1.16 {q14, q15}, [r0] ; write over low input - - sub r2, r2, r3, lsl #2 ; dst - 4*stride - add r1, r2, #4 ; hi - - ;saturate and narrow - vqmovun.s16 d0, q4 ; lo - vqmovun.s16 d1, q5 - vqmovun.s16 d2, q6 ; hi - vqmovun.s16 d3, q7 - - vst1.32 {d0[0]}, [r2], r3 ; lo - vst1.32 {d0[1]}, [r1], r3 ; hi - vst1.32 {d1[0]}, [r2], r3 - vst1.32 {d1[1]}, [r1], r3 - vst1.32 {d2[0]}, [r2], r3 - vst1.32 {d2[1]}, [r1], r3 - vst1.32 {d3[0]}, [r2] - vst1.32 {d3[1]}, [r1] - - vpop {d8-d15} - bx lr - - ENDP ; |idct_dequant_full_2x_neon| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x4e7b -; because the lowest bit in 0x8a8c is 0, we can pre-shift this -sinpi8sqrt2 DCD 0x4546 - - END diff --git a/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c new file mode 100644 index 0000000..a60ed46 --- /dev/null +++ b/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 17734; +// because the lowest bit in 0x8a8c is 0, we can pre-shift this + +void idct_dequant_full_2x_neon( + int16_t *q, + int16_t *dq, + unsigned char *dst, + int stride) { + unsigned char *dst0, *dst1; + int32x2_t d28, d29, d30, d31; + int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x4x2_t q2tmp0, q2tmp1; + int16x8x2_t q2tmp2, q2tmp3; + int16x4_t dLow0, dLow1, dHigh0, dHigh1; + + d28 = d29 = d30 = d31 = vdup_n_s32(0); + + // load dq + q0 = vld1q_s16(dq); + dq += 8; + q1 = vld1q_s16(dq); + + // load q + q2 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q3 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q4 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q5 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + + // load src from dst + dst0 = dst; + dst1 = dst + 4; + d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0); + dst0 += stride; + d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1); + dst1 += stride; + d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0); + dst0 += stride; + d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1); + dst1 += stride; + + d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0); + dst0 += stride; + d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1); + dst1 += stride; + d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0); + d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1); + + q2 = vmulq_s16(q2, q0); + q3 = vmulq_s16(q3, q1); + q4 = vmulq_s16(q4, q0); + q5 = vmulq_s16(q5, q1); + + // vswp + dLow0 = vget_low_s16(q2); + dHigh0 = vget_high_s16(q2); + dLow1 = vget_low_s16(q4); + dHigh1 = vget_high_s16(q4); + q2 = vcombine_s16(dLow0, dLow1); + q4 = vcombine_s16(dHigh0, dHigh1); + + dLow0 = vget_low_s16(q3); + dHigh0 = vget_high_s16(q3); + dLow1 = vget_low_s16(q5); + dHigh1 = vget_high_s16(q5); + q3 = vcombine_s16(dLow0, dLow1); + q5 = vcombine_s16(dHigh0, dHigh1); + + q6 = 
vqdmulhq_n_s16(q4, sinpi8sqrt2); + q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2); + q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1); + q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1); + + q10 = vqaddq_s16(q2, q3); + q11 = vqsubq_s16(q2, q3); + + q8 = vshrq_n_s16(q8, 1); + q9 = vshrq_n_s16(q9, 1); + + q4 = vqaddq_s16(q4, q8); + q5 = vqaddq_s16(q5, q9); + + q2 = vqsubq_s16(q6, q5); + q3 = vqaddq_s16(q7, q4); + + q4 = vqaddq_s16(q10, q3); + q5 = vqaddq_s16(q11, q2); + q6 = vqsubq_s16(q11, q2); + q7 = vqsubq_s16(q10, q3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + // loop 2 + q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2); + q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2); + q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1); + q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1); + + q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]); + q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]); + + q10 = vshrq_n_s16(q10, 1); + q11 = vshrq_n_s16(q11, 1); + + q10 = vqaddq_s16(q2tmp2.val[1], q10); + q11 = vqaddq_s16(q2tmp3.val[1], q11); + + q8 = vqsubq_s16(q8, q11); + q9 = vqaddq_s16(q9, q10); + + q4 = vqaddq_s16(q2, q9); + q5 = vqaddq_s16(q3, q8); + q6 = vqsubq_s16(q3, q8); + q7 = vqsubq_s16(q2, q9); + + q4 = vrshrq_n_s16(q4, 3); + q5 = vrshrq_n_s16(q5, 3); + q6 = vrshrq_n_s16(q6, 3); + q7 = vrshrq_n_s16(q7, 3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + q4 = 
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), + vreinterpret_u8_s32(d28))); + q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), + vreinterpret_u8_s32(d29))); + q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), + vreinterpret_u8_s32(d30))); + q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), + vreinterpret_u8_s32(d31))); + + d28 = vreinterpret_s32_u8(vqmovun_s16(q4)); + d29 = vreinterpret_s32_u8(vqmovun_s16(q5)); + d30 = vreinterpret_s32_u8(vqmovun_s16(q6)); + d31 = vreinterpret_s32_u8(vqmovun_s16(q7)); + + dst0 = dst; + dst1 = dst + 4; + vst1_lane_s32((int32_t *)dst0, d28, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d28, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d29, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d29, 1); + dst1 += stride; + + vst1_lane_s32((int32_t *)dst0, d30, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d30, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d31, 0); + vst1_lane_s32((int32_t *)dst1, d31, 1); + return; +} diff --git a/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm b/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm deleted file mode 100644 index c4f09c7..0000000 --- a/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm +++ /dev/null @@ -1,409 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_loop_filter_horizontal_edge_y_neon| - EXPORT |vp8_loop_filter_horizontal_edge_uv_neon| - EXPORT |vp8_loop_filter_vertical_edge_y_neon| - EXPORT |vp8_loop_filter_vertical_edge_uv_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src -; r1 int pitch -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -|vp8_loop_filter_horizontal_edge_y_neon| PROC - push {lr} - vpush {d8-d15} - - vdup.u8 q0, r2 ; duplicate blimit - vdup.u8 q1, r3 ; duplicate limit - sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines - ldr r3, [sp, #68] ; load thresh - add r12, r2, r1 - add r1, r1, r1 - - vdup.u8 q2, r3 ; duplicate thresh - - vld1.u8 {q3}, [r2@128], r1 ; p3 - vld1.u8 {q4}, [r12@128], r1 ; p2 - vld1.u8 {q5}, [r2@128], r1 ; p1 - vld1.u8 {q6}, [r12@128], r1 ; p0 - vld1.u8 {q7}, [r2@128], r1 ; q0 - vld1.u8 {q8}, [r12@128], r1 ; q1 - vld1.u8 {q9}, [r2@128] ; q2 - vld1.u8 {q10}, [r12@128] ; q3 - - sub r2, r2, r1, lsl #1 - sub r12, r12, r1, lsl #1 - - bl vp8_loop_filter_neon - - vst1.u8 {q5}, [r2@128], r1 ; store op1 - vst1.u8 {q6}, [r12@128], r1 ; store op0 - vst1.u8 {q7}, [r2@128], r1 ; store oq0 - vst1.u8 {q8}, [r12@128], r1 ; store oq1 - - vpop {d8-d15} - pop {pc} - ENDP ; |vp8_loop_filter_horizontal_edge_y_neon| - - -; r0 unsigned char *u, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -; sp+4 unsigned char *v -|vp8_loop_filter_horizontal_edge_uv_neon| PROC - push {lr} - vpush {d8-d15} - - vdup.u8 q0, r2 ; duplicate blimit - vdup.u8 q1, r3 ; duplicate limit - ldr r12, [sp, #68] ; load thresh - ldr r2, [sp, #72] ; load v ptr - vdup.u8 q2, r12 ; duplicate thresh - - sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines - sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines - - vld1.u8 {d6}, [r3@64], r1 ; p3 - vld1.u8 {d7}, [r12@64], r1 ; p3 - vld1.u8 {d8}, [r3@64], r1 ; p2 - vld1.u8 {d9}, [r12@64], r1 ; p2 - vld1.u8 {d10}, [r3@64], r1 ; p1 - vld1.u8 
{d11}, [r12@64], r1 ; p1 - vld1.u8 {d12}, [r3@64], r1 ; p0 - vld1.u8 {d13}, [r12@64], r1 ; p0 - vld1.u8 {d14}, [r3@64], r1 ; q0 - vld1.u8 {d15}, [r12@64], r1 ; q0 - vld1.u8 {d16}, [r3@64], r1 ; q1 - vld1.u8 {d17}, [r12@64], r1 ; q1 - vld1.u8 {d18}, [r3@64], r1 ; q2 - vld1.u8 {d19}, [r12@64], r1 ; q2 - vld1.u8 {d20}, [r3@64] ; q3 - vld1.u8 {d21}, [r12@64] ; q3 - - bl vp8_loop_filter_neon - - sub r0, r0, r1, lsl #1 - sub r2, r2, r1, lsl #1 - - vst1.u8 {d10}, [r0@64], r1 ; store u op1 - vst1.u8 {d11}, [r2@64], r1 ; store v op1 - vst1.u8 {d12}, [r0@64], r1 ; store u op0 - vst1.u8 {d13}, [r2@64], r1 ; store v op0 - vst1.u8 {d14}, [r0@64], r1 ; store u oq0 - vst1.u8 {d15}, [r2@64], r1 ; store v oq0 - vst1.u8 {d16}, [r0@64] ; store u oq1 - vst1.u8 {d17}, [r2@64] ; store v oq1 - - vpop {d8-d15} - pop {pc} - ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon| - -; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) -; r0 unsigned char *src -; r1 int pitch -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, - -|vp8_loop_filter_vertical_edge_y_neon| PROC - push {lr} - vpush {d8-d15} - - vdup.u8 q0, r2 ; duplicate blimit - vdup.u8 q1, r3 ; duplicate limit - sub r2, r0, #4 ; src ptr down by 4 columns - add r1, r1, r1 - ldr r3, [sp, #68] ; load thresh - add r12, r2, r1, asr #1 - - vld1.u8 {d6}, [r2], r1 - vld1.u8 {d8}, [r12], r1 - vld1.u8 {d10}, [r2], r1 - vld1.u8 {d12}, [r12], r1 - vld1.u8 {d14}, [r2], r1 - vld1.u8 {d16}, [r12], r1 - vld1.u8 {d18}, [r2], r1 - vld1.u8 {d20}, [r12], r1 - - vld1.u8 {d7}, [r2], r1 ; load second 8-line src data - vld1.u8 {d9}, [r12], r1 - vld1.u8 {d11}, [r2], r1 - vld1.u8 {d13}, [r12], r1 - vld1.u8 {d15}, [r2], r1 - vld1.u8 {d17}, [r12], r1 - vld1.u8 {d19}, [r2] - vld1.u8 {d21}, [r12] - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vdup.u8 q2, r3 ; 
duplicate thresh - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - bl vp8_loop_filter_neon - - vswp d12, d11 - vswp d16, d13 - - sub r0, r0, #2 ; dst ptr - - vswp d14, d12 - vswp d16, d15 - - add r12, r0, r1, asr #1 - - ;store op1, op0, oq0, oq1 - vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1 - vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1 - vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1 - vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1 - - vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1 - vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1 - vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1 - vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1 - vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1 - vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1 - vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0] - vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12] - - vpop {d8-d15} - pop {pc} - ENDP ; |vp8_loop_filter_vertical_edge_y_neon| - -; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; unsigned char *v) -; r0 unsigned char *u, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -; sp+4 unsigned char *v -|vp8_loop_filter_vertical_edge_uv_neon| PROC - push {lr} - vpush {d8-d15} - - vdup.u8 q0, r2 ; duplicate blimit - sub r12, r0, #4 ; move u pointer down by 4 columns - ldr r2, [sp, #72] ; load v ptr - vdup.u8 q1, r3 ; duplicate limit - sub r3, r2, #4 ; move v pointer down by 4 columns - - vld1.u8 {d6}, [r12], r1 ;load u data - vld1.u8 {d7}, [r3], r1 ;load v data - vld1.u8 {d8}, [r12], r1 - vld1.u8 {d9}, [r3], r1 - 
vld1.u8 {d10}, [r12], r1 - vld1.u8 {d11}, [r3], r1 - vld1.u8 {d12}, [r12], r1 - vld1.u8 {d13}, [r3], r1 - vld1.u8 {d14}, [r12], r1 - vld1.u8 {d15}, [r3], r1 - vld1.u8 {d16}, [r12], r1 - vld1.u8 {d17}, [r3], r1 - vld1.u8 {d18}, [r12], r1 - vld1.u8 {d19}, [r3], r1 - vld1.u8 {d20}, [r12] - vld1.u8 {d21}, [r3] - - ldr r12, [sp, #68] ; load thresh - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vdup.u8 q2, r12 ; duplicate thresh - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - bl vp8_loop_filter_neon - - vswp d12, d11 - vswp d16, d13 - vswp d14, d12 - vswp d16, d15 - - sub r0, r0, #2 - sub r2, r2, #2 - - ;store op1, op0, oq0, oq1 - vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1 - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 - vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1 - vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1 - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 - vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1 - vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1 - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 - vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1 - vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1 - vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0] - vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2] - - vpop {d8-d15} - pop {pc} - ENDP ; |vp8_loop_filter_vertical_edge_uv_neon| - -; void vp8_loop_filter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. 
- -; r0-r3 PRESERVE -; q0 flimit -; q1 limit -; q2 thresh -; q3 p3 -; q4 p2 -; q5 p1 -; q6 p0 -; q7 q0 -; q8 q1 -; q9 q2 -; q10 q3 -|vp8_loop_filter_neon| PROC - - ; vp8_filter_mask - vabd.u8 q11, q3, q4 ; abs(p3 - p2) - vabd.u8 q12, q4, q5 ; abs(p2 - p1) - vabd.u8 q13, q5, q6 ; abs(p1 - p0) - vabd.u8 q14, q8, q7 ; abs(q1 - q0) - vabd.u8 q3, q9, q8 ; abs(q2 - q1) - vabd.u8 q4, q10, q9 ; abs(q3 - q2) - - vmax.u8 q11, q11, q12 - vmax.u8 q12, q13, q14 - vmax.u8 q3, q3, q4 - vmax.u8 q15, q11, q12 - - vabd.u8 q9, q6, q7 ; abs(p0 - q0) - - ; vp8_hevmask - vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 q15, q15, q3 - - vmov.u8 q10, #0x80 ; 0x80 - - vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) - vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 - - vcge.u8 q15, q1, q15 - - ; vp8_filter() function - ; convert to signed - veor q7, q7, q10 ; qs0 - vshr.u8 q2, q2, #1 ; a = a / 2 - veor q6, q6, q10 ; ps0 - - veor q5, q5, q10 ; ps1 - vqadd.u8 q9, q9, q2 ; a = b + a - - veor q8, q8, q10 ; qs1 - - vmov.u8 q10, #3 ; #3 - - vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) - vsubl.s8 q11, d15, d13 - - vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1 - - vmovl.u8 q4, d20 - - vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1) - vorr q14, q13, q14 ; vp8_hevmask - - vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) - vmul.i16 q11, q11, q4 - - vand q1, q1, q14 ; vp8_filter &= hev - vand q15, q15, q9 ; vp8_filter_mask - - vaddw.s8 q2, q2, d2 - vaddw.s8 q11, q11, d3 - - vmov.u8 q9, #4 ; #4 - - ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d2, q2 - vqmovn.s16 d3, q11 - vand q1, q1, q15 ; vp8_filter &= mask - - vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp8_filter+3) - vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp8_filter+4) - vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q1, q1, #3 ; Filter1 >>= 3 - - - vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2) - vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1) - - ; outer tap adjustments: ++vp8_filter >> 1 - 
vrshr.s8 q1, q1, #1 - vbic q1, q1, q14 ; vp8_filter &= ~hev - vmov.u8 q0, #0x80 ; 0x80 - vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter) - vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter) - - veor q6, q11, q0 ; *op0 = u^0x80 - veor q7, q10, q0 ; *oq0 = u^0x80 - veor q5, q13, q0 ; *op1 = u^0x80 - veor q8, q12, q0 ; *oq1 = u^0x80 - - bx lr - ENDP ; |vp8_loop_filter_horizontal_edge_y_neon| - -;----------------- - - END diff --git a/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c b/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c new file mode 100644 index 0000000..0bec7fb --- /dev/null +++ b/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c @@ -0,0 +1,549 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include "./vpx_config.h" + +static INLINE void vp8_loop_filter_neon( + uint8x16_t qblimit, // flimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p3 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r) { // q1 + uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q2s16, q11s16; + uint16x8_t q4u16; + int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8; + int8x8_t d2s8, d3s8; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q3 = vabdq_u8(q9, q8); + q4 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q3 = vmaxq_u8(q3, q4); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q9 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q3); + + q2u8 = vabdq_u8(q5, q8); + q9 = vqaddq_u8(q9, q9); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + // vp8_filter() function + // convert to signed + q10 = vdupq_n_u8(0x80); + q8 = veorq_u8(q8, q10); + q7 = veorq_u8(q7, q10); + q6 = veorq_u8(q6, q10); + q5 = veorq_u8(q5, q10); + + q2u8 = vshrq_n_u8(q2u8, 1); + q9 = vqaddq_u8(q9, q2u8); + + q10 = vdupq_n_u8(3); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q9 = vcgeq_u8(qblimit, q9); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), + vreinterpretq_s8_u8(q8)); + + q14u8 = vorrq_u8(q13u8, q14u8); + + q4u16 = vmovl_u8(vget_low_u8(q10)); + q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); + q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); + + q1u8 = 
vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); + q15u8 = vandq_u8(q15u8, q9); + + q1s8 = vreinterpretq_s8_u8(q1u8); + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); + + q9 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2s8 = vqmovn_s16(q2s16); + d3s8 = vqmovn_s16(q11s16); + q1s8 = vcombine_s8(d2s8, d3s8); + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); + q1s8 = vreinterpretq_s8_u8(q1u8); + + q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10)); + q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); + q2s8 = vshrq_n_s8(q2s8, 3); + q1s8 = vshrq_n_s8(q1s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); + + q1s8 = vrshrq_n_s8(q1s8, 1); + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); + q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); + + q0u8 = vdupq_n_u8(0x80); + *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8); + *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8); + return; +} + +void vp8_loop_filter_horizontal_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + src -= (pitch << 2); + + q3 = vld1q_u8(src); + src += pitch; + q4 = vld1q_u8(src); + src += pitch; + q5 = vld1q_u8(src); + src += pitch; + q6 = vld1q_u8(src); + src += pitch; + q7 = vld1q_u8(src); + src += pitch; + q8 = vld1q_u8(src); + src += pitch; + q9 = vld1q_u8(src); + src += pitch; + q10 = vld1q_u8(src); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + src -= (pitch * 5); + vst1q_u8(src, q5); + src += 
pitch; + vst1q_u8(src, q6); + src += pitch; + vst1q_u8(src, q7); + src += pitch; + vst1q_u8(src, q8); + return; +} + +void vp8_loop_filter_horizontal_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + u -= (pitch << 2); + v -= (pitch << 2); + + d6 = vld1_u8(u); + u += pitch; + d7 = vld1_u8(v); + v += pitch; + d8 = vld1_u8(u); + u += pitch; + d9 = vld1_u8(v); + v += pitch; + d10 = vld1_u8(u); + u += pitch; + d11 = vld1_u8(v); + v += pitch; + d12 = vld1_u8(u); + u += pitch; + d13 = vld1_u8(v); + v += pitch; + d14 = vld1_u8(u); + u += pitch; + d15 = vld1_u8(v); + v += pitch; + d16 = vld1_u8(u); + u += pitch; + d17 = vld1_u8(v); + v += pitch; + d18 = vld1_u8(u); + u += pitch; + d19 = vld1_u8(v); + v += pitch; + d20 = vld1_u8(u); + d21 = vld1_u8(v); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + u -= (pitch * 5); + vst1_u8(u, vget_low_u8(q5)); + u += pitch; + vst1_u8(u, vget_low_u8(q6)); + u += pitch; + vst1_u8(u, vget_low_u8(q7)); + u += pitch; + vst1_u8(u, vget_low_u8(q8)); + + v -= (pitch * 5); + vst1_u8(v, vget_high_u8(q5)); + v += pitch; + vst1_u8(v, vget_high_u8(q6)); + v += pitch; + vst1_u8(v, vget_high_u8(q7)); + v += pitch; + vst1_u8(v, vget_high_u8(q8)); + return; +} + +static INLINE void write_4x8(unsigned char *dst, int pitch, + const uint8x8x4_t result) { +#if (__GNUC__ == 4 && 
(__GNUC_MINOR__ >= 7)) + vst4_lane_u8(dst, result, 0); + dst += pitch; + vst4_lane_u8(dst, result, 1); + dst += pitch; + vst4_lane_u8(dst, result, 2); + dst += pitch; + vst4_lane_u8(dst, result, 3); + dst += pitch; + vst4_lane_u8(dst, result, 4); + dst += pitch; + vst4_lane_u8(dst, result, 5); + dst += pitch; + vst4_lane_u8(dst, result, 6); + dst += pitch; + vst4_lane_u8(dst, result, 7); +#else + /* + * uint8x8x4_t result + 00 01 02 03 | 04 05 06 07 + 10 11 12 13 | 14 15 16 17 + 20 21 22 23 | 24 25 26 27 + 30 31 32 33 | 34 35 36 37 + --- + * after vtrn_u16 + 00 01 20 21 | 04 05 24 25 + 02 03 22 23 | 06 07 26 27 + 10 11 30 31 | 14 15 34 35 + 12 13 32 33 | 16 17 36 37 + --- + * after vtrn_u8 + 00 10 20 30 | 04 14 24 34 + 01 11 21 31 | 05 15 25 35 + 02 12 22 32 | 06 16 26 36 + 03 13 23 33 | 07 17 27 37 + */ + const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]), + vreinterpret_u16_u8(result.val[2])); + const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]), + vreinterpret_u16_u8(result.val[3])); + const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), + vreinterpret_u8_u16(r13_u16.val[0])); + const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), + vreinterpret_u8_u16(r13_u16.val[1])); + const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]); + const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]); + const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]); + const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]); + vst1_lane_u32((uint32_t *)dst, x_0_4, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_1_5, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_2_6, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_3_7, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_0_4, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_1_5, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_2_6, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_3_7, 1); +#endif 
+} + +void vp8_loop_filter_vertical_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + unsigned char *s, *d; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + s = src - 4; + d6 = vld1_u8(s); + s += pitch; + d8 = vld1_u8(s); + s += pitch; + d10 = vld1_u8(s); + s += pitch; + d12 = vld1_u8(s); + s += pitch; + d14 = vld1_u8(s); + s += pitch; + d16 = vld1_u8(s); + s += pitch; + d18 = vld1_u8(s); + s += pitch; + d20 = vld1_u8(s); + s += pitch; + d7 = vld1_u8(s); + s += pitch; + d9 = vld1_u8(s); + s += pitch; + d11 = vld1_u8(s); + s += pitch; + d13 = vld1_u8(s); + s += pitch; + d15 = vld1_u8(s); + s += pitch; + d17 = vld1_u8(s); + s += pitch; + d19 = vld1_u8(s); + s += pitch; + d21 = vld1_u8(s); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + 
vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + + d = src - 2; + write_4x8(d, pitch, q4ResultL); + d += pitch * 8; + write_4x8(d, pitch, q4ResultH); +} + +void vp8_loop_filter_vertical_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + unsigned char *us, *ud; + unsigned char *vs, *vd; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + us = u - 4; + 
d6 = vld1_u8(us); + us += pitch; + d8 = vld1_u8(us); + us += pitch; + d10 = vld1_u8(us); + us += pitch; + d12 = vld1_u8(us); + us += pitch; + d14 = vld1_u8(us); + us += pitch; + d16 = vld1_u8(us); + us += pitch; + d18 = vld1_u8(us); + us += pitch; + d20 = vld1_u8(us); + + vs = v - 4; + d7 = vld1_u8(vs); + vs += pitch; + d9 = vld1_u8(vs); + vs += pitch; + d11 = vld1_u8(vs); + vs += pitch; + d13 = vld1_u8(vs); + vs += pitch; + d15 = vld1_u8(vs); + vs += pitch; + d17 = vld1_u8(vs); + vs += pitch; + d19 = vld1_u8(vs); + vs += pitch; + d21 = vld1_u8(vs); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = 
q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + ud = u - 2; + write_4x8(ud, pitch, q4ResultL); + + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + vd = v - 2; + write_4x8(vd, pitch, q4ResultH); +} diff --git a/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm deleted file mode 100644 index 78d13c8..0000000 --- a/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm +++ /dev/null @@ -1,156 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_loop_filter_bvs_neon| - EXPORT |vp8_loop_filter_mbvs_neon| - ARM - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *s, PRESERVE -; r1 int p, PRESERVE -; q1 limit, PRESERVE - -|vp8_loop_filter_simple_vertical_edge_neon| PROC - vpush {d8-d15} - - sub r0, r0, #2 ; move src pointer down by 2 columns - add r12, r1, r1 - add r3, r0, r1 - - vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12 - vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12 - vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12 - vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12 - vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12 - vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12 - vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12 - vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12 - - vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12 - vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12 - vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12 - vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12 - vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12 - vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12 - vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12 - vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3] - - vswp d7, d10 - vswp d12, d9 - - ;vp8_filter_mask() function - ;vp8_hevmask() function - sub r0, r0, r1, lsl #4 - vabd.u8 q15, q5, q4 ; abs(p0 - q0) - vabd.u8 q14, q3, q6 ; abs(p1 - q1) - - vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 - vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 - vmov.u8 q0, #0x80 ; 0x80 - vmov.s16 q11, #3 - vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 - - veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value - veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value - veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value - veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value - - vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 - - vsubl.s8 q2, d8, d10 ; ( qs0 - ps0) - vsubl.s8 q13, d9, d11 - - vqsub.s8 
q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) - - vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0) - vmul.s16 q13, q13, q11 - - vmov.u8 q11, #0x03 ; 0x03 - vmov.u8 q12, #0x04 ; 0x04 - - vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0) - vaddw.s8 q13, q13, d29 - - vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d29, q13 - - add r0, r0, #1 - add r3, r0, r1 - - vand q14, q14, q15 ; vp8_filter &= mask - - vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) - vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) - vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q14, q3, #3 ; Filter1 >>= 3 - - ;calculate output - vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) - vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1) - - veor q6, q11, q0 ; *op0 = u^0x80 - veor q7, q10, q0 ; *oq0 = u^0x80 - add r12, r1, r1 - vswp d13, d14 - - ;store op1, op0, oq0, oq1 - vst2.8 {d12[0], d13[0]}, [r0], r12 - vst2.8 {d12[1], d13[1]}, [r3], r12 - vst2.8 {d12[2], d13[2]}, [r0], r12 - vst2.8 {d12[3], d13[3]}, [r3], r12 - vst2.8 {d12[4], d13[4]}, [r0], r12 - vst2.8 {d12[5], d13[5]}, [r3], r12 - vst2.8 {d12[6], d13[6]}, [r0], r12 - vst2.8 {d12[7], d13[7]}, [r3], r12 - vst2.8 {d14[0], d15[0]}, [r0], r12 - vst2.8 {d14[1], d15[1]}, [r3], r12 - vst2.8 {d14[2], d15[2]}, [r0], r12 - vst2.8 {d14[3], d15[3]}, [r3], r12 - vst2.8 {d14[4], d15[4]}, [r0], r12 - vst2.8 {d14[5], d15[5]}, [r3], r12 - vst2.8 {d14[6], d15[6]}, [r0], r12 - vst2.8 {d14[7], d15[7]}, [r3] - - vpop {d8-d15} - bx lr - ENDP ; |vp8_loop_filter_simple_vertical_edge_neon| - -; r0 unsigned char *y -; r1 int ystride -; r2 const unsigned char *blimit - -|vp8_loop_filter_bvs_neon| PROC - push {r4, lr} - ldrb r3, [r2] ; load blim from mem - mov r4, r0 - add r0, r0, #4 - vdup.s8 q1, r3 ; duplicate blim - bl vp8_loop_filter_simple_vertical_edge_neon - ; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1 - add r0, r4, #8 - 
bl vp8_loop_filter_simple_vertical_edge_neon - add r0, r4, #12 - pop {r4, lr} - b vp8_loop_filter_simple_vertical_edge_neon - ENDP ;|vp8_loop_filter_bvs_neon| - -; r0 unsigned char *y -; r1 int ystride -; r2 const unsigned char *blimit - -|vp8_loop_filter_mbvs_neon| PROC - ldrb r3, [r2] ; load mblim from mem - vdup.s8 q1, r3 ; duplicate mblim - b vp8_loop_filter_simple_vertical_edge_neon - ENDP ;|vp8_loop_filter_bvs_neon| - END diff --git a/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c new file mode 100644 index 0000000..d5178bb --- /dev/null +++ b/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include "./vpx_config.h" + +#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +static INLINE void write_2x8(unsigned char *dst, int pitch, + const uint8x8x2_t result, + const uint8x8x2_t result2) { + vst2_lane_u8(dst, result, 0); + dst += pitch; + vst2_lane_u8(dst, result, 1); + dst += pitch; + vst2_lane_u8(dst, result, 2); + dst += pitch; + vst2_lane_u8(dst, result, 3); + dst += pitch; + vst2_lane_u8(dst, result, 4); + dst += pitch; + vst2_lane_u8(dst, result, 5); + dst += pitch; + vst2_lane_u8(dst, result, 6); + dst += pitch; + vst2_lane_u8(dst, result, 7); + dst += pitch; + + vst2_lane_u8(dst, result2, 0); + dst += pitch; + vst2_lane_u8(dst, result2, 1); + dst += pitch; + vst2_lane_u8(dst, result2, 2); + dst += pitch; + vst2_lane_u8(dst, result2, 3); + dst += pitch; + vst2_lane_u8(dst, result2, 4); + dst += pitch; + vst2_lane_u8(dst, result2, 5); + dst += pitch; + vst2_lane_u8(dst, result2, 6); + dst += pitch; + vst2_lane_u8(dst, result2, 7); +} +#else +static INLINE void write_2x4(unsigned char *dst, int pitch, + const uint8x8x2_t result) { + /* + * uint8x8x2_t result + 00 01 02 03 | 04 05 06 07 + 10 11 12 13 | 14 15 16 17 + --- + * after vtrn_u8 + 00 10 02 12 | 04 14 06 16 + 01 11 03 13 | 05 15 07 17 + */ + const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0], + result.val[1]); + const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]); + const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]); + vst1_lane_u16((uint16_t *)dst, x_0_4, 0); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 0); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_0_4, 1); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 1); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_0_4, 2); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 2); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_0_4, 3); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 3); +} + +static INLINE void write_2x8(unsigned char *dst, int pitch, + const 
uint8x8x2_t result, + const uint8x8x2_t result2) { + write_2x4(dst, pitch, result); + dst += pitch * 8; + write_2x4(dst, pitch, result2); +} +#endif + + +#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +static INLINE +uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) { + x = vld4_lane_u8(src, x, 0); + src += pitch; + x = vld4_lane_u8(src, x, 1); + src += pitch; + x = vld4_lane_u8(src, x, 2); + src += pitch; + x = vld4_lane_u8(src, x, 3); + src += pitch; + x = vld4_lane_u8(src, x, 4); + src += pitch; + x = vld4_lane_u8(src, x, 5); + src += pitch; + x = vld4_lane_u8(src, x, 6); + src += pitch; + x = vld4_lane_u8(src, x, 7); + return x; +} +#else +static INLINE +uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) { + const uint8x8_t a = vld1_u8(src); + const uint8x8_t b = vld1_u8(src + pitch * 1); + const uint8x8_t c = vld1_u8(src + pitch * 2); + const uint8x8_t d = vld1_u8(src + pitch * 3); + const uint8x8_t e = vld1_u8(src + pitch * 4); + const uint8x8_t f = vld1_u8(src + pitch * 5); + const uint8x8_t g = vld1_u8(src + pitch * 6); + const uint8x8_t h = vld1_u8(src + pitch * 7); + const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a), + vreinterpret_u32_u8(e)); + const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b), + vreinterpret_u32_u8(f)); + const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c), + vreinterpret_u32_u8(g)); + const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d), + vreinterpret_u32_u8(h)); + const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]), + vreinterpret_u16_u32(r26_u32.val[0])); + const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]), + vreinterpret_u16_u32(r37_u32.val[0])); + const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), + vreinterpret_u8_u16(r13_u16.val[0])); + const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), + vreinterpret_u8_u16(r13_u16.val[1])); + /* + * after vtrn_u32 + 00 01 02 03 | 40 41 
42 43 + 10 11 12 13 | 50 51 52 53 + 20 21 22 23 | 60 61 62 63 + 30 31 32 33 | 70 71 72 73 + --- + * after vtrn_u16 + 00 01 20 21 | 40 41 60 61 + 02 03 22 23 | 42 43 62 63 + 10 11 30 31 | 50 51 70 71 + 12 13 32 33 | 52 52 72 73 + + 00 01 20 21 | 40 41 60 61 + 10 11 30 31 | 50 51 70 71 + 02 03 22 23 | 42 43 62 63 + 12 13 32 33 | 52 52 72 73 + --- + * after vtrn_u8 + 00 10 20 30 | 40 50 60 70 + 01 11 21 31 | 41 51 61 71 + 02 12 22 32 | 42 52 62 72 + 03 13 23 33 | 43 53 63 73 + */ + x.val[0] = r01_u8.val[0]; + x.val[1] = r01_u8.val[1]; + x.val[2] = r23_u8.val[0]; + x.val[3] = r23_u8.val[1]; + + return x; +} +#endif + +static INLINE void vp8_loop_filter_simple_vertical_edge_neon( + unsigned char *s, + int p, + const unsigned char *blimit) { + unsigned char *src1; + uint8x16_t qblimit, q0u8; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8; + int16x8_t q2s16, q13s16, q11s16; + int8x8_t d28s8, d29s8; + int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8; + uint8x8x4_t d0u8x4; // d6, d7, d8, d9 + uint8x8x4_t d1u8x4; // d10, d11, d12, d13 + uint8x8x2_t d2u8x2; // d12, d13 + uint8x8x2_t d3u8x2; // d14, d15 + + qblimit = vdupq_n_u8(*blimit); + + src1 = s - 2; + d0u8x4 = read_4x8(src1, p, d0u8x4); + src1 += p * 8; + d1u8x4 = read_4x8(src1, p, d1u8x4); + + q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10 + q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12 + q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11 + q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13 + + q15u8 = vabdq_u8(q5u8, q4u8); + q14u8 = vabdq_u8(q3u8, q6u8); + + q15u8 = vqaddq_u8(q15u8, q15u8); + q14u8 = vshrq_n_u8(q14u8, 1); + q0u8 = vdupq_n_u8(0x80); + q11s16 = vdupq_n_s16(3); + q15u8 = vqaddq_u8(q15u8, q14u8); + + q3u8 = veorq_u8(q3u8, q0u8); + q4u8 = veorq_u8(q4u8, q0u8); + q5u8 = veorq_u8(q5u8, q0u8); + q6u8 = veorq_u8(q6u8, q0u8); + + q15u8 = vcgeq_u8(qblimit, q15u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)), + 
vget_low_s8(vreinterpretq_s8_u8(q5u8))); + q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)), + vget_high_s8(vreinterpretq_s8_u8(q5u8))); + + q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8), + vreinterpretq_s8_u8(q6u8)); + + q2s16 = vmulq_s16(q2s16, q11s16); + q13s16 = vmulq_s16(q13s16, q11s16); + + q11u8 = vdupq_n_u8(3); + q12u8 = vdupq_n_u8(4); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8)); + q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8)); + + d28s8 = vqmovn_s16(q2s16); + d29s8 = vqmovn_s16(q13s16); + q14s8 = vcombine_s8(d28s8, d29s8); + + q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8)); + + q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8)); + q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q14s8 = vshrq_n_s8(q3s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8); + + q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + + d2u8x2.val[0] = vget_low_u8(q6u8); // d12 + d2u8x2.val[1] = vget_low_u8(q7u8); // d14 + d3u8x2.val[0] = vget_high_u8(q6u8); // d13 + d3u8x2.val[1] = vget_high_u8(q7u8); // d15 + + src1 = s - 1; + write_2x8(src1, p, d2u8x2, d3u8x2); +} + +void vp8_loop_filter_bvs_neon( + unsigned char *y_ptr, + int y_stride, + const unsigned char *blimit) { + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + return; +} + +void vp8_loop_filter_mbvs_neon( + unsigned char *y_ptr, + int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + return; +} diff --git a/source/libvpx/vp8/common/arm/neon/reconintra_neon.c b/source/libvpx/vp8/common/arm/neon/reconintra_neon.c new file mode 100644 index 0000000..af52cd5 --- /dev/null +++ 
b/source/libvpx/vp8/common/arm/neon/reconintra_neon.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "vp8/common/blockd.h" + +void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x, + unsigned char * yabove_row, + unsigned char * yleft, + int left_stride, + unsigned char * ypred_ptr, + int y_stride) { + const int mode = x->mode_info_context->mbmi.mode; + int i; + + switch (mode) { + case DC_PRED: + { + int shift = x->up_available + x->left_available; + uint8x16_t v_expected_dc = vdupq_n_u8(128); + + if (shift) { + unsigned int average = 0; + int expected_dc; + if (x->up_available) { + const uint8x16_t v_above = vld1q_u8(yabove_row); + const uint16x8_t a = vpaddlq_u8(v_above); + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), + vreinterpret_u32_u64(vget_high_u64(c))); + average = vget_lane_u32(d, 0); + } + if (x->left_available) { + for (i = 0; i < 16; ++i) { + average += yleft[0]; + yleft += left_stride; + } + } + shift += 3; + expected_dc = (average + (1 << (shift - 1))) >> shift; + v_expected_dc = vmovq_n_u8((uint8_t)expected_dc); + } + for (i = 0; i < 16; ++i) { + vst1q_u8(ypred_ptr, v_expected_dc); + ypred_ptr += y_stride; + } + } + break; + case V_PRED: + { + const uint8x16_t v_above = vld1q_u8(yabove_row); + for (i = 0; i < 16; ++i) { + vst1q_u8(ypred_ptr, v_above); + ypred_ptr += y_stride; + } + } + break; + case H_PRED: + { + for (i = 0; i < 16; ++i) { + const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]); + yleft += left_stride; + 
vst1q_u8(ypred_ptr, v_yleft); + ypred_ptr += y_stride; + } + } + break; + case TM_PRED: + { + const uint16x8_t v_ytop_left = vmovq_n_u16((int16_t)yabove_row[-1]); + const uint8x16_t v_above = vld1q_u8(yabove_row); + for (i = 0; i < 16; ++i) { + const uint8x8_t v_yleft = vmov_n_u8((int8_t)yleft[0]); + const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft); + const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft); + const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo), + vreinterpretq_s16_u16(v_ytop_left)); + const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi), + vreinterpretq_s16_u16(v_ytop_left)); + const uint8x8_t pred_lo = vqmovun_s16(b_lo); + const uint8x8_t pred_hi = vqmovun_s16(b_hi); + + vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi)); + ypred_ptr += y_stride; + yleft += left_stride; + } + } + break; + } +} + +void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + unsigned char * upred_ptr, + unsigned char * vpred_ptr, + int pred_stride) { + const int mode = x->mode_info_context->mbmi.uv_mode; + int i; + + switch (mode) { + case DC_PRED: + { + int shift = x->up_available + x->left_available; + uint8x8_t v_expected_udc = vdup_n_u8(128); + uint8x8_t v_expected_vdc = vdup_n_u8(128); + + if (shift) { + unsigned int average_u = 0; + unsigned int average_v = 0; + int expected_udc; + int expected_vdc; + if (x->up_available) { + const uint8x8_t v_uabove = vld1_u8(uabove_row); + const uint8x8_t v_vabove = vld1_u8(vabove_row); + const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove)); + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0); + average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2); + } + if (x->left_available) { + for (i = 0; i < 8; ++i) { + average_u += uleft[0]; + uleft += left_stride; + 
average_v += vleft[0]; + vleft += left_stride; + } + } + shift += 2; + expected_udc = (average_u + (1 << (shift - 1))) >> shift; + expected_vdc = (average_v + (1 << (shift - 1))) >> shift; + v_expected_udc = vmov_n_u8((uint8_t)expected_udc); + v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc); + } + for (i = 0; i < 8; ++i) { + vst1_u8(upred_ptr, v_expected_udc); + upred_ptr += pred_stride; + vst1_u8(vpred_ptr, v_expected_vdc); + vpred_ptr += pred_stride; + } + } + break; + case V_PRED: + { + const uint8x8_t v_uabove = vld1_u8(uabove_row); + const uint8x8_t v_vabove = vld1_u8(vabove_row); + for (i = 0; i < 8; ++i) { + vst1_u8(upred_ptr, v_uabove); + upred_ptr += pred_stride; + vst1_u8(vpred_ptr, v_vabove); + vpred_ptr += pred_stride; + } + } + break; + case H_PRED: + { + for (i = 0; i < 8; ++i) { + const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]); + const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]); + uleft += left_stride; + vleft += left_stride; + vst1_u8(upred_ptr, v_uleft); + upred_ptr += pred_stride; + vst1_u8(vpred_ptr, v_vleft); + vpred_ptr += pred_stride; + } + } + break; + case TM_PRED: + { + const uint16x8_t v_utop_left = vmovq_n_u16((int16_t)uabove_row[-1]); + const uint16x8_t v_vtop_left = vmovq_n_u16((int16_t)vabove_row[-1]); + const uint8x8_t v_uabove = vld1_u8(uabove_row); + const uint8x8_t v_vabove = vld1_u8(vabove_row); + for (i = 0; i < 8; ++i) { + const uint8x8_t v_uleft = vmov_n_u8((int8_t)uleft[0]); + const uint8x8_t v_vleft = vmov_n_u8((int8_t)vleft[0]); + const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft); + const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft); + const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u), + vreinterpretq_s16_u16(v_utop_left)); + const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v), + vreinterpretq_s16_u16(v_vtop_left)); + const uint8x8_t pred_u = vqmovun_s16(b_u); + const uint8x8_t pred_v = vqmovun_s16(b_v); + + vst1_u8(upred_ptr, pred_u); + vst1_u8(vpred_ptr, pred_v); + upred_ptr += pred_stride; 
+ vpred_ptr += pred_stride; + uleft += left_stride; + vleft += left_stride; + } + } + break; + } +} diff --git a/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm b/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm deleted file mode 100644 index adc5b7e..0000000 --- a/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm +++ /dev/null @@ -1,425 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -;----------------- - - EXPORT |vp8_sub_pixel_variance16x16_neon_func| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack(r4) unsigned char *dst_ptr, -; stack(r5) int dst_pixels_per_line, -; stack(r6) unsigned int *sse -;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon. 
- -bilinear_taps_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - -|vp8_sub_pixel_variance16x16_neon_func| PROC - push {r4-r6, lr} - vpush {d8-d15} - - adr r12, bilinear_taps_coeff - ldr r4, [sp, #80] ;load *dst_ptr from stack - ldr r5, [sp, #84] ;load dst_pixels_per_line from stack - ldr r6, [sp, #88] ;load *sse from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_bfilter16x16_only - - add r2, r12, r2, lsl #3 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {d31}, [r2] ;load first_pass filter - - beq firstpass_bfilter16x16_only - - sub sp, sp, #272 ;reserve space on stack for temporary storage - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - mov lr, sp - vld1.u8 {d5, d6, d7}, [r0], r1 - - mov r2, #3 ;loop counter - vld1.u8 {d8, d9, d10}, [r0], r1 - - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {d11, d12, d13}, [r0], r1 - - vdup.8 d1, d31[4] - -;First Pass: output_height lines x output_width columns (17x16) -vp8e_filt_blk2d_fp16x16_loop_neon - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, 
#7 - vqrshrn.u16 d20, q13, #7 - - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vqrshrn.u16 d21, q14, #7 - vld1.u8 {d5, d6, d7}, [r0], r1 - - vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result - vld1.u8 {d8, d9, d10}, [r0], r1 - vst1.u8 {d18, d19, d20, d21}, [lr]! - vld1.u8 {d11, d12, d13}, [r0], r1 - - bne vp8e_filt_blk2d_fp16x16_loop_neon - -;First-pass filtering for rest 5 lines - vld1.u8 {d14, d15, d16}, [r0], r1 - - vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0]) - vmull.u8 q10, d3, d0 - vmull.u8 q11, d5, d0 - vmull.u8 q12, d6, d0 - vmull.u8 q13, d8, d0 - vmull.u8 q14, d9, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - - vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q11, d5, d1 - vmlal.u8 q13, d8, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - - vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q12, d6, d1 - vmlal.u8 q14, d9, d1 - - vmull.u8 q1, d11, d0 - vmull.u8 q2, d12, d0 - vmull.u8 q3, d14, d0 - vmull.u8 q4, d15, d0 - - vext.8 d11, d11, d12, #1 ;construct src_ptr[1] - vext.8 d14, d14, d15, #1 - - vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q3, d14, d1 - - vext.8 d12, d12, d13, #1 - vext.8 d15, d15, d16, #1 - - vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q4, d15, d1 - - vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d11, q10, #7 - vqrshrn.u16 d12, q11, #7 - vqrshrn.u16 d13, q12, #7 - vqrshrn.u16 d14, q13, #7 - vqrshrn.u16 d15, q14, #7 - vqrshrn.u16 d16, q1, #7 - vqrshrn.u16 d17, q2, #7 - vqrshrn.u16 d18, q3, #7 - vqrshrn.u16 d19, q4, #7 - - vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result - vst1.u8 {d14, d15, d16, d17}, [lr]! - vst1.u8 {d18, d19}, [lr]! - -;Second pass: 16x16 -;secondpass_filter - add r3, r12, r3, lsl #3 - sub lr, lr, #272 - - vld1.u32 {d31}, [r3] ;load second_pass filter - - sub sp, sp, #256 - mov r3, sp - - vld1.u8 {d22, d23}, [lr]! 
;load src data - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - mov r12, #4 ;loop counter - -vp8e_filt_blk2d_sp16x16_loop_neon - vld1.u8 {d24, d25}, [lr]! - vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) - vld1.u8 {d26, d27}, [lr]! - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [lr]! - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [lr]! - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - subs r12, r12, #1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r3]! ;store result - vst1.u8 {d4, d5}, [r3]! - vst1.u8 {d6, d7}, [r3]! - vmov q11, q15 - vst1.u8 {d8, d9}, [r3]! 
- - bne vp8e_filt_blk2d_sp16x16_loop_neon - - b sub_pixel_variance16x16_neon - -;-------------------- -firstpass_bfilter16x16_only - mov r2, #4 ;loop counter - sub sp, sp, #528 ;reserve space on stack for temporary storage - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vdup.8 d1, d31[4] - mov r3, sp - -;First Pass: output_height lines x output_width columns (16x16) -vp8e_filt_blk2d_fpo16x16_loop_neon - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vld1.u8 {d5, d6, d7}, [r0], r1 - vld1.u8 {d8, d9, d10}, [r0], r1 - vld1.u8 {d11, d12, d13}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - vst1.u8 {d14, d15}, [r3]! ;store result - vqrshrn.u16 d21, q14, #7 - - vst1.u8 {d16, d17}, [r3]! - vst1.u8 {d18, d19}, [r3]! - vst1.u8 {d20, d21}, [r3]! 
- - bne vp8e_filt_blk2d_fpo16x16_loop_neon - - b sub_pixel_variance16x16_neon - -;--------------------- -secondpass_bfilter16x16_only -;Second pass: 16x16 -;secondpass_filter - sub sp, sp, #528 ;reserve space on stack for temporary storage - add r3, r12, r3, lsl #3 - mov r12, #4 ;loop counter - vld1.u32 {d31}, [r3] ;load second_pass filter - vld1.u8 {d22, d23}, [r0], r1 ;load src data - mov r3, sp - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - -vp8e_filt_blk2d_spo16x16_loop_neon - vld1.u8 {d24, d25}, [r0], r1 - vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) - vld1.u8 {d26, d27}, [r0], r1 - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [r0], r1 - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [r0], r1 - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r3]! ;store result - subs r12, r12, #1 - vst1.u8 {d4, d5}, [r3]! - vmov q11, q15 - vst1.u8 {d6, d7}, [r3]! - vst1.u8 {d8, d9}, [r3]! - - bne vp8e_filt_blk2d_spo16x16_loop_neon - - b sub_pixel_variance16x16_neon - -;---------------------------- -;variance16x16 -sub_pixel_variance16x16_neon - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - sub r3, r3, #256 - mov r12, #8 - -sub_pixel_variance16x16_neon_loop - vld1.8 {q0}, [r3]! ;Load up source and reference - vld1.8 {q2}, [r4], r5 - vld1.8 {q1}, [r3]! 
- vld1.8 {q3}, [r4], r5 - - vsubl.u8 q11, d0, d4 ;diff - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - vpadal.s16 q8, q11 ;sum - vmlal.s16 q9, d22, d22 ;sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - bne sub_pixel_variance16x16_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [r6] ;store sse - vshr.u32 d10, d10, #8 - vsub.u32 d0, d1, d10 - - add sp, sp, #528 - vmov.32 r0, d0[0] ;return - - vpop {d8-d15} - pop {r4-r6,pc} - - ENDP - - END diff --git a/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm deleted file mode 100644 index b0829af..0000000 --- a/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm +++ /dev/null @@ -1,583 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_variance_halfpixvar16x16_h_neon| - EXPORT |vp8_variance_halfpixvar16x16_v_neon| - EXPORT |vp8_variance_halfpixvar16x16_hv_neon| - EXPORT |vp8_sub_pixel_variance16x16s_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;================================================ -;unsigned int vp8_variance_halfpixvar16x16_h_neon -;( -; unsigned char *src_ptr, r0 -; int src_pixels_per_line, r1 -; unsigned char *dst_ptr, r2 -; int dst_pixels_per_line, r3 -; unsigned int *sse -;); -;================================================ -|vp8_variance_halfpixvar16x16_h_neon| PROC - push {lr} - vpush {d8-d15} - - mov r12, #4 ;loop counter - ldr lr, [sp, #68] ;load *sse from stack - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - -;First Pass: output_height lines x output_width columns (16x16) -vp8_filt_fpo16x16s_4_0_loop_neon - vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data - vld1.8 {q11}, [r2], r3 - vld1.u8 {d4, d5, d6, d7}, [r0], r1 - vld1.8 {q12}, [r2], r3 - vld1.u8 {d8, d9, d10, d11}, [r0], r1 - vld1.8 {q13}, [r2], r3 - vld1.u8 {d12, d13, d14, d15}, [r0], r1 - - ;pld [r0] - ;pld [r0, r1] - ;pld [r0, r1, lsl #1] - - vext.8 q1, q0, q1, #1 ;construct src_ptr[1] - vext.8 q3, q2, q3, #1 - vext.8 q5, q4, q5, #1 - vext.8 q7, q6, q7, #1 - - vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 - vld1.8 {q14}, [r2], r3 - vrhadd.u8 q1, q2, q3 - vrhadd.u8 q2, q4, q5 - vrhadd.u8 q3, q6, q7 - - vsubl.u8 q4, d0, d22 ;diff - vsubl.u8 q5, d1, d23 - vsubl.u8 q6, d2, d24 - vsubl.u8 q7, d3, d25 - vsubl.u8 q0, d4, d26 - vsubl.u8 q1, d5, d27 - vsubl.u8 q2, d6, d28 - vsubl.u8 q3, d7, d29 - - vpadal.s16 q8, q4 ;sum - vmlal.s16 q9, d8, d8 ;sse - vmlal.s16 q10, d9, d9 - - subs r12, r12, #1 - - vpadal.s16 q8, q5 - vmlal.s16 q9, d10, d10 - vmlal.s16 q10, d11, d11 - vpadal.s16 q8, q6 - vmlal.s16 q9, d12, d12 - vmlal.s16 q10, d13, d13 - vpadal.s16 q8, q7 - vmlal.s16 q9, d14, d14 - vmlal.s16 q10, d15, d15 - - 
vpadal.s16 q8, q0 ;sum - vmlal.s16 q9, d0, d0 ;sse - vmlal.s16 q10, d1, d1 - vpadal.s16 q8, q1 - vmlal.s16 q9, d2, d2 - vmlal.s16 q10, d3, d3 - vpadal.s16 q8, q2 - vmlal.s16 q9, d4, d4 - vmlal.s16 q10, d5, d5 - vpadal.s16 q8, q3 - vmlal.s16 q9, d6, d6 - vmlal.s16 q10, d7, d7 - - bne vp8_filt_fpo16x16s_4_0_loop_neon - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [lr] ;store sse - vshr.u32 d10, d10, #8 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - - vpop {d8-d15} - pop {pc} - ENDP - -;================================================ -;unsigned int vp8_variance_halfpixvar16x16_v_neon -;( -; unsigned char *src_ptr, r0 -; int src_pixels_per_line, r1 -; unsigned char *dst_ptr, r2 -; int dst_pixels_per_line, r3 -; unsigned int *sse -;); -;================================================ -|vp8_variance_halfpixvar16x16_v_neon| PROC - push {lr} - vpush {d8-d15} - - mov r12, #4 ;loop counter - - vld1.u8 {q0}, [r0], r1 ;load src data - ldr lr, [sp, #68] ;load *sse from stack - - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - -vp8_filt_spo16x16s_0_4_loop_neon - vld1.u8 {q2}, [r0], r1 - vld1.8 {q1}, [r2], r3 - vld1.u8 {q4}, [r0], r1 - vld1.8 {q3}, [r2], r3 - vld1.u8 {q6}, [r0], r1 - vld1.8 {q5}, [r2], r3 - vld1.u8 {q15}, [r0], r1 - - vrhadd.u8 q0, q0, q2 - vld1.8 {q7}, [r2], r3 - vrhadd.u8 q2, q2, q4 - vrhadd.u8 q4, q4, q6 - vrhadd.u8 q6, q6, q15 - - vsubl.u8 q11, d0, d2 ;diff - vsubl.u8 q12, d1, d3 - vsubl.u8 q13, d4, d6 - vsubl.u8 q14, d5, d7 - vsubl.u8 q0, d8, d10 - vsubl.u8 q1, d9, d11 - vsubl.u8 q2, d12, d14 - vsubl.u8 q3, d13, d15 - - vpadal.s16 q8, q11 ;sum - vmlal.s16 q9, d22, d22 ;sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, 
q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - vpadal.s16 q8, q0 ;sum - vmlal.s16 q9, d0, d0 ;sse - vmlal.s16 q10, d1, d1 - vpadal.s16 q8, q1 - vmlal.s16 q9, d2, d2 - vmlal.s16 q10, d3, d3 - vpadal.s16 q8, q2 - vmlal.s16 q9, d4, d4 - vmlal.s16 q10, d5, d5 - - vmov q0, q15 - - vpadal.s16 q8, q3 - vmlal.s16 q9, d6, d6 - vmlal.s16 q10, d7, d7 - - bne vp8_filt_spo16x16s_0_4_loop_neon - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [lr] ;store sse - vshr.u32 d10, d10, #8 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - - vpop {d8-d15} - pop {pc} - ENDP - -;================================================ -;unsigned int vp8_variance_halfpixvar16x16_hv_neon -;( -; unsigned char *src_ptr, r0 -; int src_pixels_per_line, r1 -; unsigned char *dst_ptr, r2 -; int dst_pixels_per_line, r3 -; unsigned int *sse -;); -;================================================ -|vp8_variance_halfpixvar16x16_hv_neon| PROC - push {lr} - vpush {d8-d15} - - vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data - - ldr lr, [sp, #68] ;load *sse from stack - vmov.i8 q13, #0 ;q8 - sum - vext.8 q1, q0, q1, #1 ;construct src_ptr[1] - - vmov.i8 q14, #0 ;q9, q10 - sse - vmov.i8 q15, #0 - - mov r12, #4 ;loop counter - vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 - -;First Pass: output_height lines x output_width columns (17x16) -vp8_filt16x16s_4_4_loop_neon - vld1.u8 {d4, d5, d6, d7}, [r0], r1 - vld1.u8 {d8, d9, d10, d11}, [r0], r1 - vld1.u8 {d12, d13, d14, d15}, [r0], r1 - vld1.u8 {d16, d17, d18, d19}, [r0], r1 - - ;pld [r0] - ;pld [r0, r1] - ;pld [r0, r1, lsl #1] - - vext.8 q3, q2, q3, #1 ;construct src_ptr[1] - vext.8 q5, q4, q5, #1 - vext.8 q7, q6, q7, #1 - vext.8 q9, q8, q9, #1 - - vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 - vrhadd.u8 q2, q4, q5 - vrhadd.u8 q3, q6, q7 - vrhadd.u8 q4, q8, q9 - - vld1.8 
{q5}, [r2], r3 - vrhadd.u8 q0, q0, q1 - vld1.8 {q6}, [r2], r3 - vrhadd.u8 q1, q1, q2 - vld1.8 {q7}, [r2], r3 - vrhadd.u8 q2, q2, q3 - vld1.8 {q8}, [r2], r3 - vrhadd.u8 q3, q3, q4 - - vsubl.u8 q9, d0, d10 ;diff - vsubl.u8 q10, d1, d11 - vsubl.u8 q11, d2, d12 - vsubl.u8 q12, d3, d13 - - vsubl.u8 q0, d4, d14 ;diff - vsubl.u8 q1, d5, d15 - vsubl.u8 q5, d6, d16 - vsubl.u8 q6, d7, d17 - - vpadal.s16 q13, q9 ;sum - vmlal.s16 q14, d18, d18 ;sse - vmlal.s16 q15, d19, d19 - - vpadal.s16 q13, q10 ;sum - vmlal.s16 q14, d20, d20 ;sse - vmlal.s16 q15, d21, d21 - - vpadal.s16 q13, q11 ;sum - vmlal.s16 q14, d22, d22 ;sse - vmlal.s16 q15, d23, d23 - - vpadal.s16 q13, q12 ;sum - vmlal.s16 q14, d24, d24 ;sse - vmlal.s16 q15, d25, d25 - - subs r12, r12, #1 - - vpadal.s16 q13, q0 ;sum - vmlal.s16 q14, d0, d0 ;sse - vmlal.s16 q15, d1, d1 - - vpadal.s16 q13, q1 ;sum - vmlal.s16 q14, d2, d2 ;sse - vmlal.s16 q15, d3, d3 - - vpadal.s16 q13, q5 ;sum - vmlal.s16 q14, d10, d10 ;sse - vmlal.s16 q15, d11, d11 - - vmov q0, q4 - - vpadal.s16 q13, q6 ;sum - vmlal.s16 q14, d12, d12 ;sse - vmlal.s16 q15, d13, d13 - - bne vp8_filt16x16s_4_4_loop_neon - - vadd.u32 q15, q14, q15 ;accumulate sse - vpaddl.s32 q0, q13 ;accumulate sum - - vpaddl.u32 q1, q15 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [lr] ;store sse - vshr.u32 d10, d10, #8 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - - vpop {d8-d15} - pop {pc} - ENDP - -;============================== -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack unsigned char *dst_ptr, -; stack int dst_pixels_per_line, -; stack unsigned int *sse -;note: in vp8_find_best_half_pixel_step()(called when 8<Speed<15), and first call of vp8_find_best_sub_pixel_step() -;(called when speed<=8). xoffset/yoffset can only be 4 or 0, which means either by pass the filter, -;or filter coeff is {64, 64}. This simplified program only works in this situation. 
-;note: It happens that both xoffset and yoffset are zero. This can be handled in c code later. - -|vp8_sub_pixel_variance16x16s_neon| PROC - push {r4, lr} - vpush {d8-d15} - - ldr r4, [sp, #72] ;load *dst_ptr from stack - ldr r12, [sp, #76] ;load dst_pixels_per_line from stack - ldr lr, [sp, #80] ;load *sse from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_bfilter16x16s_only - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq firstpass_bfilter16x16s_only - - vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data - sub sp, sp, #256 ;reserve space on stack for temporary storage - vext.8 q1, q0, q1, #1 ;construct src_ptr[1] - mov r3, sp - mov r2, #4 ;loop counter - vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 - -;First Pass: output_height lines x output_width columns (17x16) -vp8e_filt_blk2d_fp16x16s_loop_neon - vld1.u8 {d4, d5, d6, d7}, [r0], r1 - vld1.u8 {d8, d9, d10, d11}, [r0], r1 - vld1.u8 {d12, d13, d14, d15}, [r0], r1 - vld1.u8 {d16, d17, d18, d19}, [r0], r1 - - ;pld [r0] - ;pld [r0, r1] - ;pld [r0, r1, lsl #1] - - vext.8 q3, q2, q3, #1 ;construct src_ptr[1] - vext.8 q5, q4, q5, #1 - vext.8 q7, q6, q7, #1 - vext.8 q9, q8, q9, #1 - - vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 - vrhadd.u8 q2, q4, q5 - vrhadd.u8 q3, q6, q7 - vrhadd.u8 q4, q8, q9 - - vrhadd.u8 q0, q0, q1 - vrhadd.u8 q1, q1, q2 - vrhadd.u8 q2, q2, q3 - vrhadd.u8 q3, q3, q4 - - subs r2, r2, #1 - vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result - vmov q0, q4 - vst1.u8 {d4, d5, d6, d7}, [r3]! 
- - bne vp8e_filt_blk2d_fp16x16s_loop_neon - - b sub_pixel_variance16x16s_neon - -;-------------------- -firstpass_bfilter16x16s_only - mov r2, #2 ;loop counter - sub sp, sp, #256 ;reserve space on stack for temporary storage - mov r3, sp - -;First Pass: output_height lines x output_width columns (16x16) -vp8e_filt_blk2d_fpo16x16s_loop_neon - vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data - vld1.u8 {d4, d5, d6, d7}, [r0], r1 - vld1.u8 {d8, d9, d10, d11}, [r0], r1 - vld1.u8 {d12, d13, d14, d15}, [r0], r1 - - ;pld [r0] - ;pld [r0, r1] - ;pld [r0, r1, lsl #1] - - vext.8 q1, q0, q1, #1 ;construct src_ptr[1] - vld1.u8 {d16, d17, d18, d19}, [r0], r1 - vext.8 q3, q2, q3, #1 - vld1.u8 {d20, d21, d22, d23}, [r0], r1 - vext.8 q5, q4, q5, #1 - vld1.u8 {d24, d25, d26, d27}, [r0], r1 - vext.8 q7, q6, q7, #1 - vld1.u8 {d28, d29, d30, d31}, [r0], r1 - vext.8 q9, q8, q9, #1 - vext.8 q11, q10, q11, #1 - vext.8 q13, q12, q13, #1 - vext.8 q15, q14, q15, #1 - - vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 - vrhadd.u8 q1, q2, q3 - vrhadd.u8 q2, q4, q5 - vrhadd.u8 q3, q6, q7 - vrhadd.u8 q4, q8, q9 - vrhadd.u8 q5, q10, q11 - vrhadd.u8 q6, q12, q13 - vrhadd.u8 q7, q14, q15 - - subs r2, r2, #1 - - vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result - vst1.u8 {d4, d5, d6, d7}, [r3]! - vst1.u8 {d8, d9, d10, d11}, [r3]! - vst1.u8 {d12, d13, d14, d15}, [r3]! 
- - bne vp8e_filt_blk2d_fpo16x16s_loop_neon - - b sub_pixel_variance16x16s_neon - -;--------------------- -secondpass_bfilter16x16s_only - sub sp, sp, #256 ;reserve space on stack for temporary storage - - mov r2, #2 ;loop counter - vld1.u8 {d0, d1}, [r0], r1 ;load src data - mov r3, sp - -vp8e_filt_blk2d_spo16x16s_loop_neon - vld1.u8 {d2, d3}, [r0], r1 - vld1.u8 {d4, d5}, [r0], r1 - vld1.u8 {d6, d7}, [r0], r1 - vld1.u8 {d8, d9}, [r0], r1 - - vrhadd.u8 q0, q0, q1 - vld1.u8 {d10, d11}, [r0], r1 - vrhadd.u8 q1, q1, q2 - vld1.u8 {d12, d13}, [r0], r1 - vrhadd.u8 q2, q2, q3 - vld1.u8 {d14, d15}, [r0], r1 - vrhadd.u8 q3, q3, q4 - vld1.u8 {d16, d17}, [r0], r1 - vrhadd.u8 q4, q4, q5 - vrhadd.u8 q5, q5, q6 - vrhadd.u8 q6, q6, q7 - vrhadd.u8 q7, q7, q8 - - subs r2, r2, #1 - - vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result - vmov q0, q8 - vst1.u8 {d4, d5, d6, d7}, [r3]! - vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result - vst1.u8 {d12, d13, d14, d15}, [r3]! - - bne vp8e_filt_blk2d_spo16x16s_loop_neon - - b sub_pixel_variance16x16s_neon - -;---------------------------- -;variance16x16 -sub_pixel_variance16x16s_neon - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - sub r3, r3, #256 - mov r2, #4 - -sub_pixel_variance16x16s_neon_loop - vld1.8 {q0}, [r3]! ;Load up source and reference - vld1.8 {q1}, [r4], r12 - vld1.8 {q2}, [r3]! - vld1.8 {q3}, [r4], r12 - vld1.8 {q4}, [r3]! - vld1.8 {q5}, [r4], r12 - vld1.8 {q6}, [r3]! 
- vld1.8 {q7}, [r4], r12 - - vsubl.u8 q11, d0, d2 ;diff - vsubl.u8 q12, d1, d3 - vsubl.u8 q13, d4, d6 - vsubl.u8 q14, d5, d7 - vsubl.u8 q0, d8, d10 - vsubl.u8 q1, d9, d11 - vsubl.u8 q2, d12, d14 - vsubl.u8 q3, d13, d15 - - vpadal.s16 q8, q11 ;sum - vmlal.s16 q9, d22, d22 ;sse - vmlal.s16 q10, d23, d23 - - subs r2, r2, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - vpadal.s16 q8, q0 ;sum - vmlal.s16 q9, d0, d0 ;sse - vmlal.s16 q10, d1, d1 - vpadal.s16 q8, q1 - vmlal.s16 q9, d2, d2 - vmlal.s16 q10, d3, d3 - vpadal.s16 q8, q2 - vmlal.s16 q9, d4, d4 - vmlal.s16 q10, d5, d5 - vpadal.s16 q8, q3 - vmlal.s16 q9, d6, d6 - vmlal.s16 q10, d7, d7 - - bne sub_pixel_variance16x16s_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [lr] ;store sse - vshr.u32 d10, d10, #8 - vsub.u32 d0, d1, d10 - - add sp, sp, #256 - vmov.32 r0, d0[0] ;return - - vpop {d8-d15} - pop {r4, pc} - ENDP - - END diff --git a/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm b/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm deleted file mode 100644 index 9d9f9e0..0000000 --- a/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm +++ /dev/null @@ -1,225 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_sub_pixel_variance8x8_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack(r4) unsigned char *dst_ptr, -; stack(r5) int dst_pixels_per_line, -; stack(r6) unsigned int *sse -;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon. - -|vp8_sub_pixel_variance8x8_neon| PROC - push {r4-r5, lr} - vpush {d8-d15} - - adr r12, bilinear_taps_coeff - ldr r4, [sp, #76] ;load *dst_ptr from stack - ldr r5, [sp, #80] ;load dst_pixels_per_line from stack - ldr lr, [sp, #84] ;load *sse from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (9x8) - add r2, r12, r2, lsl #3 ;calculate filter location - - vld1.u8 {q1}, [r0], r1 ;load src data - vld1.u32 {d31}, [r2] ;load first_pass filter - vld1.u8 {q2}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {q3}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {q4}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0]) - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - - vld1.u8 {q1}, [r0], r1 ;load src data - vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 - vld1.u8 {q2}, [r0], r1 - vqrshrn.u16 d23, q7, #7 - vld1.u8 {q3}, [r0], r1 - vqrshrn.u16 d24, q8, #7 - vld1.u8 {q4}, [r0], r1 - vqrshrn.u16 d25, q9, #7 - - ;first_pass filtering on the rest 5-line data - vld1.u8 {q5}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0]) - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - 
vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - vext.8 d11, d10, d11, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - vmlal.u8 q10, d11, d1 - - vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d27, q7, #7 - vqrshrn.u16 d28, q8, #7 - vqrshrn.u16 d29, q9, #7 - vqrshrn.u16 d30, q10, #7 - -;Second pass: 8x8 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - ;skip_secondpass_filter - beq sub_pixel_variance8x8_neon - - add r3, r12, r3, lsl #3 - - vld1.u32 {d31}, [r3] ;load second_pass filter - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - - vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) - vmull.u8 q2, d23, d0 - vmull.u8 q3, d24, d0 - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1]) - vmlal.u8 q2, d24, d1 - vmlal.u8 q3, d25, d1 - vmlal.u8 q4, d26, d1 - vmlal.u8 q5, d27, d1 - vmlal.u8 q6, d28, d1 - vmlal.u8 q7, d29, d1 - vmlal.u8 q8, d30, d1 - - vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d23, q2, #7 - vqrshrn.u16 d24, q3, #7 - vqrshrn.u16 d25, q4, #7 - vqrshrn.u16 d26, q5, #7 - vqrshrn.u16 d27, q6, #7 - vqrshrn.u16 d28, q7, #7 - vqrshrn.u16 d29, q8, #7 - - b sub_pixel_variance8x8_neon - -;-------------------- -skip_firstpass_filter - vld1.u8 {d22}, [r0], r1 ;load src data - vld1.u8 {d23}, [r0], r1 - vld1.u8 {d24}, [r0], r1 - vld1.u8 {d25}, [r0], r1 - vld1.u8 {d26}, [r0], r1 - vld1.u8 {d27}, [r0], r1 - vld1.u8 {d28}, [r0], r1 - vld1.u8 {d29}, [r0], r1 - vld1.u8 {d30}, [r0], r1 - - b secondpass_filter - -;---------------------- -;vp8_variance8x8_neon -sub_pixel_variance8x8_neon - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - mov r12, #2 - -sub_pixel_variance8x8_neon_loop - vld1.8 {d0}, [r4], r5 ;load dst data - subs r12, r12, #1 - vld1.8 {d1}, [r4], r5 - 
vld1.8 {d2}, [r4], r5 - vsubl.u8 q4, d22, d0 ;calculate diff - vld1.8 {d3}, [r4], r5 - - vsubl.u8 q5, d23, d1 - vsubl.u8 q6, d24, d2 - - vpadal.s16 q8, q4 ;sum - vmlal.s16 q9, d8, d8 ;sse - vmlal.s16 q10, d9, d9 - - vsubl.u8 q7, d25, d3 - - vpadal.s16 q8, q5 - vmlal.s16 q9, d10, d10 - vmlal.s16 q10, d11, d11 - - vmov q11, q13 - - vpadal.s16 q8, q6 - vmlal.s16 q9, d12, d12 - vmlal.s16 q10, d13, d13 - - vmov q12, q14 - - vpadal.s16 q8, q7 - vmlal.s16 q9, d14, d14 - vmlal.s16 q10, d15, d15 - - bne sub_pixel_variance8x8_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [lr] ;store sse - vshr.u32 d10, d10, #6 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - - vpop {d8-d15} - pop {r4-r5, pc} - - ENDP - -;----------------- - -bilinear_taps_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c b/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c new file mode 100644 index 0000000..6405bf2 --- /dev/null +++ b/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c @@ -0,0 +1,1028 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +#ifdef _MSC_VER +#define __builtin_prefetch(x) +#endif + +static const uint16_t bilinear_taps_coeff[8][2] = { + {128, 0}, + {112, 16}, + { 96, 32}, + { 80, 48}, + { 64, 64}, + { 48, 80}, + { 32, 96}, + { 16, 112} +}; + +unsigned int vp8_sub_pixel_variance16x16_neon_func( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) { + int i; + DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528); + unsigned char *tmpp; + unsigned char *tmpp2; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; + uint8x8_t d19u8, d20u8, d21u8; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64, d2s64, d3s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; + uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16; + uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + tmpp2 = tmp + 272; + tmpp = tmp; + if (xoffset == 0) { // secondpass_bfilter16x16_only + d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); + + q11u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q13u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q14u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q15u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + + __builtin_prefetch(src_ptr); + __builtin_prefetch(src_ptr + src_pixels_per_line); + __builtin_prefetch(src_ptr + src_pixels_per_line * 2); + + q1u16 = 
vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)tmpp2, q1u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q2u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q3u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q4u8); + tmpp2 += 16; + } + } else if (yoffset == 0) { // firstpass_bfilter16x16_only + d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); + + for (i = 4; i > 0 ; i--) { + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += 
src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + __builtin_prefetch(src_ptr); + __builtin_prefetch(src_ptr + src_pixels_per_line); + __builtin_prefetch(src_ptr + src_pixels_per_line * 2); + + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp2, q7u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q8u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q9u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q10u8); + tmpp2 += 16; + } + } else { + d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = 
vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + // First Pass: output_height lines x output_width columns (17x16) + for (i = 3; i > 0; i--) { + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 
= vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp, q7u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q10u8); + tmpp += 16; + } + + // First-pass filtering for rest 5 lines + d14u8 = vld1_u8(src_ptr); + d15u8 = vld1_u8(src_ptr + 8); + d16u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + q9u16 = vmull_u8(d2u8, d0u8); + q10u16 = vmull_u8(d3u8, d0u8); + q11u16 = vmull_u8(d5u8, d0u8); + q12u16 = vmull_u8(d6u8, d0u8); + q13u16 = vmull_u8(d8u8, d0u8); + q14u16 = vmull_u8(d9u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + + q9u16 = vmlal_u8(q9u16, d2u8, d1u8); + q11u16 = vmlal_u8(q11u16, d5u8, d1u8); + q13u16 = vmlal_u8(q13u16, d8u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + + q10u16 = vmlal_u8(q10u16, d3u8, d1u8); + q12u16 = vmlal_u8(q12u16, d6u8, d1u8); + q14u16 = vmlal_u8(q14u16, d9u8, d1u8); + + q1u16 = vmull_u8(d11u8, d0u8); + q2u16 = vmull_u8(d12u8, d0u8); + q3u16 = vmull_u8(d14u8, d0u8); + q4u16 = vmull_u8(d15u8, d0u8); + + d11u8 = vext_u8(d11u8, d12u8, 1); + d14u8 = vext_u8(d14u8, d15u8, 1); + + q1u16 = vmlal_u8(q1u16, d11u8, d1u8); + q3u16 = vmlal_u8(q3u16, d14u8, d1u8); + + d12u8 = vext_u8(d12u8, d13u8, 1); + d15u8 = vext_u8(d15u8, d16u8, 1); + + q2u16 = vmlal_u8(q2u16, d12u8, d1u8); + q4u16 = vmlal_u8(q4u16, d15u8, d1u8); + + d10u8 = vqrshrn_n_u16(q9u16, 7); + d11u8 = vqrshrn_n_u16(q10u16, 7); + d12u8 = vqrshrn_n_u16(q11u16, 7); + d13u8 = vqrshrn_n_u16(q12u16, 7); + d14u8 = vqrshrn_n_u16(q13u16, 7); + d15u8 = 
vqrshrn_n_u16(q14u16, 7); + d16u8 = vqrshrn_n_u16(q1u16, 7); + d17u8 = vqrshrn_n_u16(q2u16, 7); + d18u8 = vqrshrn_n_u16(q3u16, 7); + d19u8 = vqrshrn_n_u16(q4u16, 7); + + q5u8 = vcombine_u8(d10u8, d11u8); + q6u8 = vcombine_u8(d12u8, d13u8); + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + + vst1q_u8((uint8_t *)tmpp, q5u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q6u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q7u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + + // secondpass_filter + d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); + + tmpp = tmp; + tmpp2 = tmpp + 272; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(tmpp); + tmpp += 16; + q13u8 = vld1q_u8(tmpp); + tmpp += 16; + q14u8 = vld1q_u8(tmpp); + tmpp += 16; + q15u8 = vld1q_u8(tmpp); + tmpp += 16; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = 
vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)tmpp2, q1u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q2u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q3u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q4u8); + tmpp2 += 16; + } + } + + // sub_pixel_variance16x16_neon + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + tmpp = tmp + 272; + for (i = 0; i < 8; i++) { // sub_pixel_variance16x16_neon_loop + q0u8 = vld1q_u8(tmpp); + tmpp += 16; + q1u8 = vld1q_u8(tmpp); + tmpp += 16; + q2u8 = vld1q_u8(dst_ptr); + dst_ptr += dst_pixels_per_line; + q3u8 = vld1q_u8(dst_ptr); + dst_ptr += dst_pixels_per_line; + + d0u8 = vget_low_u8(q0u8); + d1u8 = vget_high_u8(q0u8); + d2u8 = vget_low_u8(q1u8); + d3u8 = vget_high_u8(q1u8); + + q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8)); + q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8)); + q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8)); + q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = 
vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vget_low_s64(q0s64); + d1s64 = vget_high_s64(q0s64); + d2s64 = vget_low_s64(q1s64); + d3s64 = vget_high_s64(q1s64); + d0s64 = vadd_s64(d0s64, d1s64); + d1s64 = vadd_s64(d2s64, d3s64); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance_halfpixvar16x16_h_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64, d2s64, d3s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8; + uint8x16_t q7u8, q11u8, q12u8, q13u8, q14u8; + uint16x8_t q0u16, q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon + q0u8 = vld1q_u8(src_ptr); + q1u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q2u8 = vld1q_u8(src_ptr); + q3u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q4u8 = vld1q_u8(src_ptr); + q5u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q6u8 = vld1q_u8(src_ptr); + q7u8 = vld1q_u8(src_ptr + 16); + src_ptr += 
source_stride; + + q11u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q12u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q13u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q14u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q1u8 = vextq_u8(q0u8, q1u8, 1); + q3u8 = vextq_u8(q2u8, q3u8, 1); + q5u8 = vextq_u8(q4u8, q5u8, 1); + q7u8 = vextq_u8(q6u8, q7u8, 1); + + q0u8 = vrhaddq_u8(q0u8, q1u8); + q1u8 = vrhaddq_u8(q2u8, q3u8); + q2u8 = vrhaddq_u8(q4u8, q5u8); + q3u8 = vrhaddq_u8(q6u8, q7u8); + + d0u8 = vget_low_u8(q0u8); + d1u8 = vget_high_u8(q0u8); + d2u8 = vget_low_u8(q1u8); + d3u8 = vget_high_u8(q1u8); + d4u8 = vget_low_u8(q2u8); + d5u8 = vget_high_u8(q2u8); + d6u8 = vget_low_u8(q3u8); + d7u8 = vget_high_u8(q3u8); + + q4u16 = vsubl_u8(d0u8, vget_low_u8(q11u8)); + q5u16 = vsubl_u8(d1u8, vget_high_u8(q11u8)); + q6u16 = vsubl_u8(d2u8, vget_low_u8(q12u8)); + q7u16 = vsubl_u8(d3u8, vget_high_u8(q12u8)); + q0u16 = vsubl_u8(d4u8, vget_low_u8(q13u8)); + q1u16 = vsubl_u8(d5u8, vget_high_u8(q13u8)); + q2u16 = vsubl_u8(d6u8, vget_low_u8(q14u8)); + q3u16 = vsubl_u8(d7u8, vget_high_u8(q14u8)); + + d8s16 = vreinterpret_s16_u16(vget_low_u16(q4u16)); + d9s16 = vreinterpret_s16_u16(vget_high_u16(q4u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q4u16)); + q9s32 = vmlal_s16(q9s32, d8s16, d8s16); + q10s32 = vmlal_s16(q10s32, d9s16, d9s16); + d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16)); + d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q5u16)); + q9s32 = vmlal_s16(q9s32, d10s16, d10s16); + q10s32 = vmlal_s16(q10s32, d11s16, d11s16); + d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16)); + d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q6u16)); + q9s32 = vmlal_s16(q9s32, d12s16, d12s16); + q10s32 = vmlal_s16(q10s32, d13s16, d13s16); + d14s16 = vreinterpret_s16_u16(vget_low_u16(q7u16)); + d15s16 = vreinterpret_s16_u16(vget_high_u16(q7u16)); 
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q7u16)); + q9s32 = vmlal_s16(q9s32, d14s16, d14s16); + q10s32 = vmlal_s16(q10s32, d15s16, d15s16); + d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); + d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16)); + q9s32 = vmlal_s16(q9s32, d0s16, d0s16); + q10s32 = vmlal_s16(q10s32, d1s16, d1s16); + d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); + d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16)); + q9s32 = vmlal_s16(q9s32, d2s16, d2s16); + q10s32 = vmlal_s16(q10s32, d3s16, d3s16); + d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16)); + d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16)); + q9s32 = vmlal_s16(q9s32, d4s16, d4s16); + q10s32 = vmlal_s16(q10s32, d5s16, d5s16); + d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16)); + d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16)); + q9s32 = vmlal_s16(q9s32, d6s16, d6s16); + q10s32 = vmlal_s16(q10s32, d7s16, d7s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vget_low_s64(q0s64); + d1s64 = vget_high_s64(q0s64); + d2s64 = vget_low_s64(q1s64); + d3s64 = vget_high_s64(q1s64); + d0s64 = vadd_s64(d0s64, d1s64); + d1s64 = vadd_s64(d2s64, d3s64); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance_halfpixvar16x16_v_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d1u8, d4u8, 
d5u8, d8u8, d9u8, d12u8, d13u8; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64, d2s64, d3s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q15u8; + uint16x8_t q0u16, q1u16, q2u16, q3u16, q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon + q2u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q4u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q6u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q15u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + + q1u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q5u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q7u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q0u8 = vrhaddq_u8(q0u8, q2u8); + q2u8 = vrhaddq_u8(q2u8, q4u8); + q4u8 = vrhaddq_u8(q4u8, q6u8); + q6u8 = vrhaddq_u8(q6u8, q15u8); + + d0u8 = vget_low_u8(q0u8); + d1u8 = vget_high_u8(q0u8); + d4u8 = vget_low_u8(q2u8); + d5u8 = vget_high_u8(q2u8); + d8u8 = vget_low_u8(q4u8); + d9u8 = vget_high_u8(q4u8); + d12u8 = vget_low_u8(q6u8); + d13u8 = vget_high_u8(q6u8); + + q11u16 = vsubl_u8(d0u8, vget_low_u8(q1u8)); + q12u16 = vsubl_u8(d1u8, vget_high_u8(q1u8)); + q13u16 = vsubl_u8(d4u8, vget_low_u8(q3u8)); + q14u16 = vsubl_u8(d5u8, vget_high_u8(q3u8)); + q0u16 = vsubl_u8(d8u8, vget_low_u8(q5u8)); + q1u16 = vsubl_u8(d9u8, vget_high_u8(q5u8)); + q2u16 = vsubl_u8(d12u8, vget_low_u8(q7u8)); + q3u16 = vsubl_u8(d13u8, vget_high_u8(q7u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, 
vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); + d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16)); + q9s32 = vmlal_s16(q9s32, d0s16, d0s16); + q10s32 = vmlal_s16(q10s32, d1s16, d1s16); + d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); + d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16)); + q9s32 = vmlal_s16(q9s32, d2s16, d2s16); + q10s32 = vmlal_s16(q10s32, d3s16, d3s16); + d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16)); + d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16)); + q9s32 = vmlal_s16(q9s32, d4s16, d4s16); + q10s32 = vmlal_s16(q10s32, d5s16, d5s16); + d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16)); + d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16)); + q9s32 = vmlal_s16(q9s32, d6s16, d6s16); + q10s32 = vmlal_s16(q10s32, d7s16, d7s16); + + q0u8 = q15u8; + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = 
vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vget_low_s64(q0s64); + d1s64 = vget_high_s64(q0s64); + d2s64 = vget_low_s64(q1s64); + d3s64 = vget_high_s64(q1s64); + d0s64 = vadd_s64(d0s64, d1s64); + d1s64 = vadd_s64(d2s64, d3s64); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +}
+
+// Variance of a 16x16 block against ref_ptr, where the source block is first
+// interpolated at the half-pixel position in both directions (each output
+// pixel is a rounding average of horizontally adjacent pixels, then a
+// rounding average of vertically adjacent results).
+//
+// src_ptr/source_stride: source rows; 17 rows are read, and two 16-byte loads
+//                        are issued per row (only byte 16 of the second load
+//                        contributes, via vextq_u8).
+// ref_ptr/recon_stride:  reference rows; 16 rows of 16 bytes are read.
+// sse:                   receives the sum of squared differences.
+// Returns sse - ((sum * sum) >> 8), where sum is the sum of differences.
+unsigned int vp8_variance_halfpixvar16x16_hv_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16;
+    int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64, d2s64, d3s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
+    uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16;
+    int32x4_t q13s32, q14s32, q15s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    // q13 accumulates the differences (sum); q14/q15 accumulate the squared
+    // differences of the low/high halves of each 8-lane difference vector.
+    q13s32 = vdupq_n_s32(0);
+    q14s32 = vdupq_n_s32(0);
+    q15s32 = vdupq_n_s32(0);
+
+    // Row 0: build the row shifted left by one pixel with vextq_u8 and take
+    // the rounding average with the unshifted row (horizontal half-pel).
+    q0u8 = vld1q_u8(src_ptr);
+    q1u8 = vld1q_u8(src_ptr + 16);
+    src_ptr += source_stride;
+    q1u8 = vextq_u8(q0u8, q1u8, 1);
+    q0u8 = vrhaddq_u8(q0u8, q1u8);
+    // Four iterations of four rows each cover the 16 output rows. On entry
+    // q0u8 holds the horizontally averaged row just above the current group.
+    for (i = 0; i < 4; i++) {  // vp8_filt_fpo16x16s_4_0_loop_neon
+        // Load the next four source rows (32 bytes each).
+        q2u8 = vld1q_u8(src_ptr);
+        q3u8 = vld1q_u8(src_ptr + 16);
+        src_ptr += source_stride;
+        q4u8 = vld1q_u8(src_ptr);
+        q5u8 = vld1q_u8(src_ptr + 16);
+        src_ptr += source_stride;
+        q6u8 = vld1q_u8(src_ptr);
+        q7u8 = vld1q_u8(src_ptr + 16);
+        src_ptr += source_stride;
+        q8u8 = vld1q_u8(src_ptr);
+        q9u8 = vld1q_u8(src_ptr + 16);
+        src_ptr += source_stride;
+
+        // Each row shifted left by one pixel.
+        q3u8 = vextq_u8(q2u8, q3u8, 1);
+        q5u8 = vextq_u8(q4u8, q5u8, 1);
+        q7u8 = vextq_u8(q6u8, q7u8, 1);
+        q9u8 = vextq_u8(q8u8, q9u8, 1);
+
+        // Horizontal averages of the four new rows (q1..q4), then vertical
+        // averages of each pair of consecutive rows (results in q0..q3).
+        q1u8 = vrhaddq_u8(q2u8, q3u8);
+        q2u8 = vrhaddq_u8(q4u8, q5u8);
+        q3u8 = vrhaddq_u8(q6u8, q7u8);
+        q4u8 = vrhaddq_u8(q8u8, q9u8);
+        q0u8 = vrhaddq_u8(q0u8, q1u8);
+        q1u8 = vrhaddq_u8(q1u8, q2u8);
+        q2u8 = vrhaddq_u8(q2u8, q3u8);
+        q3u8 = vrhaddq_u8(q3u8, q4u8);
+
+        // Matching four reference rows.
+        q5u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q6u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q7u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q8u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+
+        d0u8 = vget_low_u8(q0u8);
+        d1u8 = vget_high_u8(q0u8);
+        d2u8 = vget_low_u8(q1u8);
+        d3u8 = vget_high_u8(q1u8);
+        d4u8 = vget_low_u8(q2u8);
+        d5u8 = vget_high_u8(q2u8);
+        d6u8 = vget_low_u8(q3u8);
+        d7u8 = vget_high_u8(q3u8);
+
+        // Widening subtract: interpolated source minus reference. The u16
+        // results are reinterpreted as s16 below to obtain signed differences.
+        q9u16 = vsubl_u8(d0u8, vget_low_u8(q5u8));
+        q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8));
+        q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8));
+        q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8));
+        q0u16 = vsubl_u8(d4u8, vget_low_u8(q7u8));
+        q1u16 = vsubl_u8(d5u8, vget_high_u8(q7u8));
+        q5u16 = vsubl_u8(d6u8, vget_low_u8(q8u8));
+        q6u16 = vsubl_u8(d7u8, vget_high_u8(q8u8));
+
+        // For each of the eight difference vectors: add the differences into
+        // q13 (pairwise, widened to 32 bits) and the squares into q14/q15.
+        d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+        d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16));
+        q14s32 = vmlal_s16(q14s32, d18s16, d18s16);
+        q15s32 = vmlal_s16(q15s32, d19s16, d19s16);
+
+        d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+        d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16));
+        q14s32 = vmlal_s16(q14s32, d20s16, d20s16);
+        q15s32 = vmlal_s16(q15s32, d21s16, d21s16);
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16));
+        q14s32 = vmlal_s16(q14s32, d22s16, d22s16);
+        q15s32 = vmlal_s16(q15s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16));
+        q14s32 = vmlal_s16(q14s32, d24s16, d24s16);
+        q15s32 = vmlal_s16(q15s32, d25s16, d25s16);
+
+        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
+        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16));
+        q14s32 = vmlal_s16(q14s32, d0s16, d0s16);
+        q15s32 = vmlal_s16(q15s32, d1s16, d1s16);
+
+        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
+        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16));
+        q14s32 = vmlal_s16(q14s32, d2s16, d2s16);
+        q15s32 = vmlal_s16(q15s32, d3s16, d3s16);
+
+        d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
+        d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16));
+        q14s32 = vmlal_s16(q14s32, d10s16, d10s16);
+        q15s32 = vmlal_s16(q15s32, d11s16, d11s16);
+
+        d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
+        d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16));
+        q14s32 = vmlal_s16(q14s32, d12s16, d12s16);
+        q15s32 = vmlal_s16(q15s32, d13s16, d13s16);
+
+        // Carry the last horizontally averaged row into the next iteration.
+        q0u8 = q4u8;
+    }
+
+    // Reduce the vector accumulators: d0s64 = sum, d1s64 = sum of squares.
+    q15s32 = vaddq_s32(q14s32, q15s32);
+    q0s64 = vpaddlq_s32(q13s32);
+    q1s64 = vpaddlq_s32(q15s32);
+
+    d0s64 = vget_low_s64(q0s64);
+    d1s64 = vget_high_s64(q0s64);
+    d2s64 = vget_low_s64(q1s64);
+    d3s64 = vget_high_s64(q1s64);
+    d0s64 = vadd_s64(d0s64, d1s64);
+    d1s64 = vadd_s64(d2s64, d3s64);
+
+    // sum * sum (from the low 32 bits of the sum); the SSE goes to *sse.
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    // variance = sse - ((sum * sum) >> 8); 256 = 16 * 16 pixels.
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+// Named integer constants (block geometry, filter step, buffer alignment).
+enum { kWidth8 = 8 };
+enum { kHeight8 = 8 };
+enum { kHeight8PlusOne = 9 };
+enum { kPixelStepOne = 1 };
+enum {
kAlign16 = 16 }; + +#define FILTER_BITS 7 + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static void variance_neon_w8(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = vmlal_s16(v_sse_lo, + vget_low_s16(sv_diff), + vget_low_s16(sv_diff)); + v_sse_hi = vmlal_s16(v_sse_hi, + vget_high_s16(sv_diff), + vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +static unsigned int variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8)); +} + +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint16_t *vpx_filter) { + 
const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]); + const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]); + unsigned int i; + for (i = 0; i < output_height; ++i) { + const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); + const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); + const uint16x8_t a = vmull_u8(src_0, f0); + const uint16x8_t b = vmlal_u8(a, src_1, f1); + const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); + vst1_u8(&output_ptr[0], out); + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +unsigned int vp8_sub_pixel_variance8x8_neon( + const unsigned char *src, + int src_stride, + int xoffset, + int yoffset, + const unsigned char *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8); + if (xoffset == 0) { + var_filter_block2d_bil_w8(src, temp2, src_stride, kWidth8, kHeight8, + kWidth8, bilinear_taps_coeff[yoffset]); + } else if (yoffset == 0) { + var_filter_block2d_bil_w8(src, temp2, src_stride, kPixelStepOne, + kHeight8PlusOne, kWidth8, + bilinear_taps_coeff[xoffset]); + } else { + var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne, + kHeight8PlusOne, kWidth8, + bilinear_taps_coeff[xoffset]); + var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8, + kWidth8, bilinear_taps_coeff[yoffset]); + } + return variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse); +} + diff --git a/source/libvpx/vp8/common/arm/reconintra_arm.c b/source/libvpx/vp8/common/arm/reconintra_arm.c deleted file mode 100644 index e55a33c..0000000 --- a/source/libvpx/vp8/common/arm/reconintra_arm.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_config.h" -#include "vp8_rtcd.h" -#include "vp8/common/blockd.h" -#include "vpx_mem/vpx_mem.h" - -#if HAVE_NEON_ASM -extern void vp8_build_intra_predictors_mby_neon_func( - unsigned char *y_buffer, - unsigned char *ypred_ptr, - int y_stride, - int mode, - int Up, - int Left); - -void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x) -{ - unsigned char *y_buffer = x->dst.y_buffer; - unsigned char *ypred_ptr = x->predictor; - int y_stride = x->dst.y_stride; - int mode = x->mode_info_context->mbmi.mode; - int Up = x->up_available; - int Left = x->left_available; - - vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left); -} - -extern void vp8_build_intra_predictors_mby_s_neon_func( - unsigned char *y_buffer, - unsigned char *ypred_ptr, - int y_stride, - int mode, - int Up, - int Left); - -void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x) -{ - unsigned char *y_buffer = x->dst.y_buffer; - unsigned char *ypred_ptr = x->predictor; - int y_stride = x->dst.y_stride; - int mode = x->mode_info_context->mbmi.mode; - int Up = x->up_available; - int Left = x->left_available; - - vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left); -} - -#endif diff --git a/source/libvpx/vp8/common/arm/variance_arm.c b/source/libvpx/vp8/common/arm/variance_arm.c index e3f7083..467a509 100644 --- a/source/libvpx/vp8/common/arm/variance_arm.c +++ b/source/libvpx/vp8/common/arm/variance_arm.c @@ -95,7 +95,7 @@ unsigned int vp8_sub_pixel_variance16x16_armv6 #endif /* HAVE_MEDIA */ -#if HAVE_NEON_ASM +#if HAVE_NEON extern unsigned int vp8_sub_pixel_variance16x16_neon_func ( diff --git a/source/libvpx/vp8/common/onyx.h b/source/libvpx/vp8/common/onyx.h index b05ad14..d48c4fe 100644 --- 
a/source/libvpx/vp8/common/onyx.h +++ b/source/libvpx/vp8/common/onyx.h @@ -224,7 +224,7 @@ extern "C" int arnr_strength; int arnr_type; - struct vpx_fixed_buf two_pass_stats_in; + vpx_fixed_buf_t two_pass_stats_in; struct vpx_codec_pkt_list *output_pkt_list; vp8e_tuning tuning; diff --git a/source/libvpx/vp8/common/rtcd_defs.pl b/source/libvpx/vp8/common/rtcd_defs.pl index 204cbf0..a90c876 100644 --- a/source/libvpx/vp8/common/rtcd_defs.pl +++ b/source/libvpx/vp8/common/rtcd_defs.pl @@ -38,15 +38,13 @@ $vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6; $vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2; add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs"; -specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon_asm dspr2/; +specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2/; $vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6; -$vp8_dequant_idct_add_y_block_neon_asm=vp8_dequant_idct_add_y_block_neon; $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2; add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"; -specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon_asm dspr2/; +specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2/; $vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6; -$vp8_dequant_idct_add_uv_block_neon_asm=vp8_dequant_idct_add_uv_block_neon; $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2; # @@ -58,9 +56,8 @@ $vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6; $vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2; add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_bv mmx sse2 media neon_asm dspr2/; +specialize qw/vp8_loop_filter_bv mmx sse2 
media neon dspr2/; $vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6; -$vp8_loop_filter_bv_neon_asm=vp8_loop_filter_bv_neon; $vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2; add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; @@ -69,19 +66,18 @@ $vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6; $vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2; add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_bh mmx sse2 media neon_asm dspr2/; +specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2/; $vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6; -$vp8_loop_filter_bh_neon_asm=vp8_loop_filter_bh_neon; $vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2; add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit"; -specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon_asm/; +specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon/; $vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c; $vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx; $vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2; $vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6; -$vp8_loop_filter_simple_mbv_neon_asm=vp8_loop_filter_mbvs_neon; +$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon; add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit"; specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon/; @@ -92,12 +88,12 @@ $vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6; $vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon; add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char 
*blimit"; -specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon_asm/; +specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon/; $vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c; $vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx; $vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2; $vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6; -$vp8_loop_filter_simple_bv_neon_asm=vp8_loop_filter_bvs_neon; +$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon; add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit"; specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon/; @@ -153,11 +149,10 @@ $vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6; $vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2; add_proto qw/void vp8_build_intra_predictors_mby_s/, "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride"; -specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3/; -#TODO: fix assembly for neon +specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3 neon/; add_proto qw/void vp8_build_intra_predictors_mbuv_s/, "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride"; -specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3/; +specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3 neon/; add_proto qw/void vp8_intra4x4_predict/, "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left"; specialize qw/vp8_intra4x4_predict media/; @@ -446,14 +441,12 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") { # Forward DCT # add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_fdct4x4 mmx sse2 media neon_asm/; +specialize qw/vp8_short_fdct4x4 mmx sse2 
media neon/; $vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6; -$vp8_short_fdct4x4_neon_asm=vp8_short_fdct4x4_neon; add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_fdct8x4 mmx sse2 media neon_asm/; +specialize qw/vp8_short_fdct8x4 mmx sse2 media neon/; $vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6; -$vp8_short_fdct8x4_neon_asm=vp8_short_fdct8x4_neon; add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch"; specialize qw/vp8_short_walsh4x4 sse2 media neon/; @@ -537,13 +530,6 @@ if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") { } # -# Pick Loopfilter -# -add_proto qw/void vp8_yv12_copy_partial_frame/, "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"; -specialize qw/vp8_yv12_copy_partial_frame neon_asm/; -$vp8_yv12_copy_partial_frame_neon_asm=vp8_yv12_copy_partial_frame_neon; - -# # Denoiser filter # if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") { @@ -551,7 +537,6 @@ if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") { specialize qw/vp8_denoiser_filter sse2 neon/; add_proto qw/int vp8_denoiser_filter_uv/, "unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising"; specialize qw/vp8_denoiser_filter_uv sse2 neon/; - } # End of encoder only functions diff --git a/source/libvpx/vp8/encoder/arm/neon/picklpf_arm.c b/source/libvpx/vp8/encoder/arm/neon/picklpf_arm.c deleted file mode 100644 index ec8071e..0000000 --- a/source/libvpx/vp8/encoder/arm/neon/picklpf_arm.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp8/common/loopfilter.h" -#include "vpx_scale/yv12config.h" - -extern void vp8_memcpy_partial_neon(unsigned char *dst_ptr, - unsigned char *src_ptr, - int sz); - - -void vp8_yv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc) -{ - unsigned char *src_y, *dst_y; - int yheight; - int ystride; - int yoffset; - int linestocopy; - - yheight = src_ybc->y_height; - ystride = src_ybc->y_stride; - - /* number of MB rows to use in partial filtering */ - linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION; - linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */ - - /* Copy extra 4 so that full filter context is available if filtering done - * on the copied partial frame and not original. Partial filter does mb - * filtering for top row also, which can modify3 pixels above. - */ - linestocopy += 4; - /* partial image starts at ~middle of frame (macroblock border) */ - yoffset = ystride * (((yheight >> 5) * 16) - 4); - src_y = src_ybc->y_buffer + yoffset; - dst_y = dst_ybc->y_buffer + yoffset; - - vp8_memcpy_partial_neon(dst_y, src_y, ystride * linestocopy); -} diff --git a/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm b/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm deleted file mode 100644 index 5ea8dd8..0000000 --- a/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm +++ /dev/null @@ -1,221 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_short_fdct4x4_neon| - EXPORT |vp8_short_fdct8x4_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=4 - - - ALIGN 16 ; enable use of @128 bit aligned loads -coeff - DCW 5352, 5352, 5352, 5352 - DCW 2217, 2217, 2217, 2217 - DCD 14500, 14500, 14500, 14500 - DCD 7500, 7500, 7500, 7500 - DCD 12000, 12000, 12000, 12000 - DCD 51000, 51000, 51000, 51000 - -;void vp8_short_fdct4x4_c(short *input, short *output, int pitch) -|vp8_short_fdct4x4_neon| PROC - - ; Part one - vld1.16 {d0}, [r0@64], r2 - adr r12, coeff - vld1.16 {d1}, [r0@64], r2 - vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217 - vld1.16 {d2}, [r0@64], r2 - vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500 - vld1.16 {d3}, [r0@64], r2 - - ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3] - vtrn.32 d0, d2 - vtrn.32 d1, d3 - vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000 - vtrn.16 d0, d1 - vtrn.16 d2, d3 - - vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3] - vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2] - vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2] - vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3] - - vshl.s16 q2, q2, #3 ; (a1, b1) << 3 - vshl.s16 q3, q3, #3 ; (c1, d1) << 3 - - vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 - vsub.s16 d2, d4, d5 ; op[2] = a1 - b1 - - vmlal.s16 q9, d7, d16 ; d1*5352 + 14500 - vmlal.s16 q10, d7, d17 ; d1*2217 + 7500 - vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500 - vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500 - - vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12 - vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12 - - - ; Part two - - ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] - vtrn.32 d0, d2 - vtrn.32 d1, d3 - vtrn.16 d0, d1 - vtrn.16 d2, d3 - - vmov.s16 d26, #7 - - vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12] - vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8] - vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8] - vadd.s16 d4, d4, d26 ; a1 + 7 - vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12] - - vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7 - 
vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7 - - vmlal.s16 q11, d7, d16 ; d1*5352 + 12000 - vmlal.s16 q12, d7, d17 ; d1*2217 + 51000 - - vceq.s16 d4, d7, #0 - - vshr.s16 d0, d0, #4 - vshr.s16 d2, d2, #4 - - vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000 - vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000 - - vmvn d4, d4 - vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16 - vsub.s16 d1, d1, d4 ; op[4] += (d1!=0) - vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16 - - vst1.16 {q0, q1}, [r1@128] - - bx lr - - ENDP - -;void vp8_short_fdct8x4_c(short *input, short *output, int pitch) -|vp8_short_fdct8x4_neon| PROC - - ; Part one - - vld1.16 {q0}, [r0@128], r2 - adr r12, coeff - vld1.16 {q1}, [r0@128], r2 - vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217 - vld1.16 {q2}, [r0@128], r2 - vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500 - vld1.16 {q3}, [r0@128], r2 - - ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3] - vtrn.32 q0, q2 ; [A0|B0] - vtrn.32 q1, q3 ; [A1|B1] - vtrn.16 q0, q1 ; [A2|B2] - vtrn.16 q2, q3 ; [A3|B3] - - vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3] - vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2] - vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2] - vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3] - - vshl.s16 q11, q11, #3 ; a1 << 3 - vshl.s16 q12, q12, #3 ; b1 << 3 - vshl.s16 q13, q13, #3 ; c1 << 3 - vshl.s16 q14, q14, #3 ; d1 << 3 - - vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1 - vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1 - - vmov.s16 q11, q9 ; 14500 - vmov.s16 q12, q10 ; 7500 - - vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500 - vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500 - vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500 - vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500 - - vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500 - vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500 - vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500 - vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500 - - vshrn.s32 d2, q9, 
#12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12 - vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12 - vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12 - vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12 - - - ; Part two - vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000 - - ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12] - vtrn.32 q0, q2 ; q0=[A0 | B0] - vtrn.32 q1, q3 ; q1=[A4 | B4] - vtrn.16 q0, q1 ; q2=[A8 | B8] - vtrn.16 q2, q3 ; q3=[A12|B12] - - vmov.s16 q15, #7 - - vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12] - vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8] - vadd.s16 q11, q11, q15 ; a1 + 7 - vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8] - vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12] - - vadd.s16 q0, q11, q12 ; a1 + b1 + 7 - vsub.s16 q1, q11, q12 ; a1 - b1 + 7 - - vmov.s16 q11, q9 ; 12000 - vmov.s16 q12, q10 ; 51000 - - vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4 - vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4 - vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4 - vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4 - - - vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000 - vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000 - vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000 - vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000 - - vceq.s16 q14, q14, #0 - - vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000 - vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000 - vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000 - vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000 - - vmvn q14, q14 - - vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16 - vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16 - vsub.s16 d1, d1, d28 ; A[4] += (d1!=0) - - vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16 - vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16 - vsub.s16 d5, d5, d29 ; B[4] += (d1!=0) - - vst1.16 {q0, q1}, [r1@128]! ; block A - vst1.16 {q2, q3}, [r1@128]! 
; block B - - bx lr - - ENDP - - END - diff --git a/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c b/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c new file mode 100644 index 0000000..391e5f9 --- /dev/null +++ b/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */
+
+#include <arm_neon.h>
+
+// Forward 4x4 transform of one block.
+// input:  four rows of four int16 values; consecutive rows are pitch / 2
+//         elements apart (presumably pitch is given in bytes -- confirm
+//         against callers).
+// output: 16 contiguous int16 coefficients.
+void vp8_short_fdct4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d16s16, d17s16, d26s16, dEmptys16;
+    uint16x4_t d4u16;
+    int16x8_t q0s16, q1s16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+
+    // Multipliers (d16, d17) and the rounding terms that seed the 32-bit
+    // accumulators: q9/q10 for part one (>> 12), q11/q12 for part two (>> 16).
+    d16s16 = vdup_n_s16(5352);
+    d17s16 = vdup_n_s16(2217);
+    q9s32 = vdupq_n_s32(14500);
+    q10s32 = vdupq_n_s32(7500);
+    q11s32 = vdupq_n_s32(12000);
+    q12s32 = vdupq_n_s32(51000);
+
+    // Part one
+    pitch >>= 1;
+    d0s16 = vld1_s16(input);
+    input += pitch;
+    d1s16 = vld1_s16(input);
+    input += pitch;
+    d2s16 = vld1_s16(input);
+    input += pitch;
+    d3s16 = vld1_s16(input);
+
+    // Transpose the 4x4 block so each vector holds one column of the input.
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    // a1 = ip[0] + ip[3], b1 = ip[1] + ip[2],
+    // c1 = ip[1] - ip[2], d1 = ip[0] - ip[3]
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    // Scale a1, b1, c1, d1 by 8.
+    d4s16 = vshl_n_s16(d4s16, 3);
+    d5s16 = vshl_n_s16(d5s16, 3);
+    d6s16 = vshl_n_s16(d6s16, 3);
+    d7s16 = vshl_n_s16(d7s16, 3);
+
+    // op[0] = a1 + b1, op[2] = a1 - b1
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d2s16 = vsub_s16(d4s16, d5s16);
+
+    // q9  = c1 * 2217 + d1 * 5352 + 14500
+    // q10 = d1 * 2217 - c1 * 5352 + 7500
+    q9s32 = vmlal_s16(q9s32, d7s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d7s16, d17s16);
+    q9s32 = vmlal_s16(q9s32, d6s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d6s16, d16s16);
+
+    // op[1] = q9 >> 12, op[3] = q10 >> 12
+    d1s16 = vshrn_n_s32(q9s32, 12);
+    d3s16 = vshrn_n_s32(q10s32, 12);
+
+    // Part two
+    // Transpose the part-one results and repeat the butterfly on them.
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    // a1 = ip[0] + ip[12], b1 = ip[4] + ip[8],
+    // c1 = ip[4] - ip[8],  d1 = ip[0] - ip[12]
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    // a1 + 7: rounding term for the >> 4 below.
+    d26s16 = vdup_n_s16(7);
+    d4s16 = vadd_s16(d4s16, d26s16);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d2s16 = vsub_s16(d4s16, d5s16);
+
+    q11s32 = vmlal_s16(q11s32, d7s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d7s16, d17s16);
+
+    // d4u16 = all-ones in lanes where d1 == 0 (inverted further down).
+    dEmptys16 = vdup_n_s16(0);
+    d4u16 = vceq_s16(d7s16, dEmptys16);
+
+    // op[0] = (a1 + b1 + 7) >> 4, op[8] = (a1 - b1 + 7) >> 4
+    d0s16 = vshr_n_s16(d0s16, 4);
+    d2s16 = vshr_n_s16(d2s16, 4);
+
+    // q11 = c1 * 2217 + d1 * 5352 + 12000
+    // q12 = d1 * 2217 - c1 * 5352 + 51000
+    q11s32 = vmlal_s16(q11s32, d6s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d6s16, d16s16);
+
+    // After the inversion d4u16 is -1 where d1 != 0, so subtracting it adds 1:
+    // op[4] = (q11 >> 16) + (d1 != 0); op[12] = q12 >> 16.
+    d4u16 = vmvn_u16(d4u16);
+    d1s16 = vshrn_n_s32(q11s32, 16);
+    d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16));
+    d3s16 = vshrn_n_s32(q12s32, 16);
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    return;
+}
+
+// Forward 4x4 transform of two horizontally adjacent blocks at once.
+// input:  four rows of eight int16 values (block A in lanes 0-3, block B in
+//         lanes 4-7); consecutive rows are pitch / 2 elements apart
+//         (presumably pitch is given in bytes -- confirm against callers).
+// output: 32 contiguous int16 coefficients, block A in output[0..15] and
+//         block B in output[16..31].
+void vp8_short_fdct8x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16;
+    uint16x4_t d28u16, d29u16;
+    uint16x8_t q14u16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16;
+    int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+    int16x8x2_t v2tmp0, v2tmp1;
+    int32x4x2_t v2tmp2, v2tmp3;
+
+    // Multipliers and the part-one rounding terms.
+    d16s16 = vdup_n_s16(5352);
+    d17s16 = vdup_n_s16(2217);
+    q9s32 = vdupq_n_s32(14500);
+    q10s32 = vdupq_n_s32(7500);
+
+    // Part one
+    pitch >>= 1;
+    q0s16 = vld1q_s16(input);
+    input += pitch;
+    q1s16 = vld1q_s16(input);
+    input += pitch;
+    q2s16 = vld1q_s16(input);
+    input += pitch;
+    q3s16 = vld1q_s16(input);
+
+    // Transpose each 4x4 half independently (low half = A, high half = B).
+    v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+                       vreinterpretq_s32_s16(q2s16));
+    v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+                       vreinterpretq_s32_s16(q3s16));
+    v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
+                       vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
+    v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
+                       vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3
+
+    // a1 = ip[0] + ip[3], b1 = ip[1] + ip[2],
+    // c1 = ip[1] - ip[2], d1 = ip[0] - ip[3]
+    q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    // Scale a1, b1, c1, d1 by 8.
+    q11s16 = vshlq_n_s16(q11s16, 3);
+    q12s16 = vshlq_n_s16(q12s16, 3);
+    q13s16 = vshlq_n_s16(q13s16, 3);
+    q14s16 = vshlq_n_s16(q14s16, 3);
+
+    // [A0 | B0] = a1 + b1, [A2 | B2] = a1 - b1
+    q0s16 = vaddq_s16(q11s16, q12s16);
+    q2s16 = vsubq_s16(q11s16, q12s16);
+
+    // q9/q10 accumulate for block A, q11/q12 (same rounding seeds) for B.
+    q11s32 = q9s32;
+    q12s32 = q10s32;
+
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+    q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+    q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+    // [1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
+    // [3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12
+    d2s16 = vshrn_n_s32(q9s32, 12);
+    d6s16 = vshrn_n_s32(q10s32, 12);
+    d3s16 = vshrn_n_s32(q11s32, 12);
+    d7s16 = vshrn_n_s32(q12s32, 12);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    // Part two
+    // Rounding terms for the >> 16 results of this pass.
+    q9s32 = vdupq_n_s32(12000);
+    q10s32 = vdupq_n_s32(51000);
+
+    v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+                       vreinterpretq_s32_s16(q2s16));
+    v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+                       vreinterpretq_s32_s16(q3s16));
+    v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
+                       vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
+    v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
+                       vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3
+
+    // a1 = ip[0] + ip[12], b1 = ip[4] + ip[8],
+    // c1 = ip[4] - ip[8],  d1 = ip[0] - ip[12]
+    q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    // a1 + 7 (rounding for the >> 4), then a1 + b1 + 7 and a1 - b1 + 7.
+    q15s16 = vdupq_n_s16(7);
+    q11s16 = vaddq_s16(q11s16, q15s16);
+    q0s16 = vaddq_s16(q11s16, q12s16);
+    q1s16 = vsubq_s16(q11s16, q12s16);
+
+    q11s32 = q9s32;
+    q12s32 = q10s32;
+
+    d0s16 = vget_low_s16(q0s16);
+    d1s16 = vget_high_s16(q0s16);
+    d2s16 = vget_low_s16(q1s16);
+    d3s16 = vget_high_s16(q1s16);
+
+    // A[0], B[0] = (a1 + b1 + 7) >> 4; A[8], B[8] = (a1 - b1 + 7) >> 4.
+    d0s16 = vshr_n_s16(d0s16, 4);
+    d4s16 = vshr_n_s16(d1s16, 4);
+    d2s16 = vshr_n_s16(d2s16, 4);
+    d6s16 = vshr_n_s16(d3s16, 4);
+
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+    q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+    q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+    // [4]  = (c1 * 2217 + d1 * 5352 + 12000) >> 16
+    // [12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16
+    d1s16 = vshrn_n_s32(q9s32, 16);
+    d3s16 = vshrn_n_s32(q10s32, 16);
+    d5s16 = vshrn_n_s32(q11s32, 16);
+    d7s16 = vshrn_n_s32(q12s32, 16);
+
+    // q14u16 = -1 in lanes where d1 != 0; subtracting it adds 1 to [4].
+    qEmptys16 = vdupq_n_s16(0);
+    q14u16 = vceqq_s16(q14s16, qEmptys16);
+    q14u16 = vmvnq_u16(q14u16);
+
+    d28u16 = vget_low_u16(q14u16);
+    d29u16 = vget_high_u16(q14u16);
+    d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16));
+    d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16));
+
+    // q0/q1 hold block A, q2/q3 hold block B.
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    vst1q_s16(output + 16, q2s16);
+    vst1q_s16(output + 24, q3s16);
+    return;
+}
diff --git a/source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm deleted file mode 100644 index d219e2d..0000000 --- a/source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm +++ /dev/null @@ -1,72 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_memcpy_partial_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;========================================= -;this is not a full memcpy function!!! -;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr, -; int sz); -|vp8_memcpy_partial_neon| PROC - vpush {d8-d15} - ;pld [r1] ;preload pred data - ;pld [r1, #128] - ;pld [r1, #256] - ;pld [r1, #384] - - mov r12, r2, lsr #8 ;copy 256 bytes data at one time - -memcpy_neon_loop - vld1.8 {q0, q1}, [r1]! ;load src data - subs r12, r12, #1 - vld1.8 {q2, q3}, [r1]! - vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr - vld1.8 {q4, q5}, [r1]! - vst1.8 {q2, q3}, [r0]! - vld1.8 {q6, q7}, [r1]! - vst1.8 {q4, q5}, [r0]! - vld1.8 {q8, q9}, [r1]!
- vst1.8 {q6, q7}, [r0]! - vld1.8 {q10, q11}, [r1]! - vst1.8 {q8, q9}, [r0]! - vld1.8 {q12, q13}, [r1]! - vst1.8 {q10, q11}, [r0]! - vld1.8 {q14, q15}, [r1]! - vst1.8 {q12, q13}, [r0]! - vst1.8 {q14, q15}, [r0]! - - ;pld [r1] ;preload pred data -- need to adjust for real device - ;pld [r1, #128] - ;pld [r1, #256] - ;pld [r1, #384] - - bne memcpy_neon_loop - - ands r3, r2, #0xff ;extra copy - beq done_copy_neon_loop - -extra_copy_neon_loop - vld1.8 {q0}, [r1]! ;load src data - subs r3, r3, #16 - vst1.8 {q0}, [r0]! - bne extra_copy_neon_loop - -done_copy_neon_loop - vpop {d8-d15} - bx lr - ENDP - - END diff --git a/source/libvpx/vp8/encoder/denoising.c b/source/libvpx/vp8/encoder/denoising.c index 2da0d8c..0c98eb1 100644 --- a/source/libvpx/vp8/encoder/denoising.c +++ b/source/libvpx/vp8/encoder/denoising.c @@ -413,9 +413,11 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, denoiser->nmse_source_diff = 0; denoiser->nmse_source_diff_count = 0; // TODO(marpan): Adjust thresholds, including effect on resolution. 
- denoiser->threshold_aggressive_mode = 40; + denoiser->threshold_aggressive_mode = 35; if (width * height > 640 * 480) - denoiser->threshold_aggressive_mode = 180; + denoiser->threshold_aggressive_mode = 150; + else if (width * height > 1280 * 720) + denoiser->threshold_aggressive_mode = 1400; return 0; } diff --git a/source/libvpx/vp8/encoder/onyx_if.c b/source/libvpx/vp8/encoder/onyx_if.c index 38b8999..74e75c4 100644 --- a/source/libvpx/vp8/encoder/onyx_if.c +++ b/source/libvpx/vp8/encoder/onyx_if.c @@ -3293,6 +3293,7 @@ static void update_reference_frames(VP8_COMP *cpi) } +#if CONFIG_TEMPORAL_DENOISING static void process_denoiser_mode_change(VP8_COMP *cpi) { const VP8_COMMON *const cm = &cpi->common; int i, j; @@ -3399,6 +3400,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) { cpi->denoiser.nmse_source_diff_count = 0; } } +#endif void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { diff --git a/source/libvpx/vp8/encoder/pickinter.c b/source/libvpx/vp8/encoder/pickinter.c index 8dd1881..43f8957 100644 --- a/source/libvpx/vp8/encoder/pickinter.c +++ b/source/libvpx/vp8/encoder/pickinter.c @@ -487,6 +487,7 @@ static int evaluate_inter_mode(unsigned int* sse, int rate2, int* distortion2, MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode; int_mv mv = x->e_mbd.mode_info_context->mbmi.mv; int this_rd; + int denoise_aggressive = 0; /* Exit early and don't compute the distortion if this macroblock * is marked inactive. */ if (cpi->active_map_enabled && x->active_ptr[0] == 0) @@ -505,10 +506,17 @@ static int evaluate_inter_mode(unsigned int* sse, int rate2, int* distortion2, this_rd = RDCOST(x->rdmult, x->rddiv, rate2, *distortion2); +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + denoise_aggressive = + (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) ? 1 : 0; + } +#endif + // Adjust rd for ZEROMV and LAST, if LAST is the closest reference frame. 
if (this_mode == ZEROMV && x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME && - cpi->closest_reference_frame == LAST_FRAME) + (denoise_aggressive || cpi->closest_reference_frame == LAST_FRAME)) { this_rd = ((int64_t)this_rd) * rd_adj / 100; } diff --git a/source/libvpx/vp8/encoder/picklpf.c b/source/libvpx/vp8/encoder/picklpf.c index 250d04c..f0c8f28 100644 --- a/source/libvpx/vp8/encoder/picklpf.c +++ b/source/libvpx/vp8/encoder/picklpf.c @@ -23,8 +23,8 @@ extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); -void vp8_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc) +static void yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { unsigned char *src_y, *dst_y; int yheight; @@ -173,7 +173,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) /* Get the err using the previous frame's filter value. */ /* Copy the unfiltered / processed recon buffer to the new buffer */ - vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show); + yv12_copy_partial_frame(saved_frame, cm->frame_to_show); vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); best_err = calc_partial_ssl_err(sd, cm->frame_to_show); @@ -184,7 +184,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) while (filt_val >= min_filter_level) { /* Apply the loop filter */ - vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show); + yv12_copy_partial_frame(saved_frame, cm->frame_to_show); vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); /* Get the err for filtered frame */ @@ -214,7 +214,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) while (filt_val < max_filter_level) { /* Apply the loop filter */ - vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show); + yv12_copy_partial_frame(saved_frame, cm->frame_to_show); vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); diff --git 
a/source/libvpx/vp8/vp8_common.mk b/source/libvpx/vp8/vp8_common.mk index 6db031f..9b11c0d 100644 --- a/source/libvpx/vp8/vp8_common.mk +++ b/source/libvpx/vp8/vp8_common.mk @@ -155,30 +155,25 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM) -# common (neon) -#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/reconintra_arm.c -VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/loopfilter_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM) -#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/buildintrapredictorsmby_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_blk_neon.c -VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_dequant_0_2x_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_dequant_full_2x_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM) - # common (neon intrinsics) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += 
common/arm/neon/loopfilter_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl)) diff --git a/source/libvpx/vp8/vp8_cx_iface.c b/source/libvpx/vp8/vp8_cx_iface.c index 2f394ef..b1b079c 100644 --- a/source/libvpx/vp8/vp8_cx_iface.c +++ b/source/libvpx/vp8/vp8_cx_iface.c @@ -14,6 +14,7 @@ #include "vpx/vpx_codec.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx_version.h" +#include "vpx_mem/vpx_mem.h" #include "vp8/encoder/onyx_int.h" #include "vpx/vp8cx.h" #include "vp8/encoder/firstpass.h" @@ -39,40 +40,28 @@ struct vp8_extracfg }; -struct extraconfig_map -{ - int usage; - struct vp8_extracfg cfg; -}; - -static const struct extraconfig_map extracfg_map[] = -{ - { - 0, - { - NULL, +static struct vp8_extracfg default_extracfg = { + NULL, #if !(CONFIG_REALTIME_ONLY) - 0, /* cpu_used */ + 0, /* cpu_used */ #else - 4, /* cpu_used */ + 4, /* cpu_used */ #endif - 0, /* enable_auto_alt_ref */ - 0, /* noise_sensitivity */ - 0, /* Sharpness */ - 0, /* static_thresh */ + 0, /* enable_auto_alt_ref */ + 0, /* noise_sensitivity */ + 0, /* Sharpness */ + 0, /* static_thresh */ #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) - VP8_EIGHT_TOKENPARTITION, + VP8_EIGHT_TOKENPARTITION, #else - VP8_ONE_TOKENPARTITION, /* token_partitions */ + VP8_ONE_TOKENPARTITION, /* token_partitions */ #endif - 0, /* 
arnr_max_frames */ - 3, /* arnr_strength */ - 3, /* arnr_type*/ - 0, /* tuning*/ - 10, /* cq_level */ - 0, /* rc_max_intra_bitrate_pct */ - } - } + 0, /* arnr_max_frames */ + 3, /* arnr_strength */ + 3, /* arnr_type*/ + 0, /* tuning*/ + 10, /* cq_level */ + 0, /* rc_max_intra_bitrate_pct */ }; struct vpx_codec_alg_priv @@ -631,26 +620,21 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, vpx_codec_priv_enc_mr_cfg_t *mr_cfg) { vpx_codec_err_t res = VPX_CODEC_OK; - struct vpx_codec_alg_priv *priv; - vpx_codec_enc_cfg_t *cfg; - unsigned int i; - struct VP8_COMP *optr; vp8_rtcd(); if (!ctx->priv) { - priv = calloc(1, sizeof(struct vpx_codec_alg_priv)); + struct vpx_codec_alg_priv *priv = + (struct vpx_codec_alg_priv *)vpx_calloc(1, sizeof(*priv)); if (!priv) { return VPX_CODEC_MEM_ERROR; } - ctx->priv = &priv->base; - ctx->priv->sz = sizeof(*ctx->priv); - ctx->priv->alg_priv = priv; + ctx->priv = (vpx_codec_priv_t *)priv; ctx->priv->init_flags = ctx->init_flags; if (ctx->config.enc) @@ -658,21 +642,11 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, /* Update the reference to the config structure to an * internal copy. */ - ctx->priv->alg_priv->cfg = *ctx->config.enc; - ctx->config.enc = &ctx->priv->alg_priv->cfg; + priv->cfg = *ctx->config.enc; + ctx->config.enc = &priv->cfg; } - cfg = &ctx->priv->alg_priv->cfg; - - /* Select the extra vp8 configuration table based on the current - * usage value. If the current usage value isn't found, use the - * values for usage case 0. 
- */ - for (i = 0; - extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage; - i++); - - priv->vp8_cfg = extracfg_map[i].cfg; + priv->vp8_cfg = default_extracfg; priv->vp8_cfg.pkt_list = &priv->pkt_list.head; priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2; @@ -695,17 +669,10 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, if (!res) { - set_vp8e_config(&ctx->priv->alg_priv->oxcf, - ctx->priv->alg_priv->cfg, - ctx->priv->alg_priv->vp8_cfg, - mr_cfg); - - optr = vp8_create_compressor(&ctx->priv->alg_priv->oxcf); - - if (!optr) + set_vp8e_config(&priv->oxcf, priv->cfg, priv->vp8_cfg, mr_cfg); + priv->cpi = vp8_create_compressor(&priv->oxcf); + if (!priv->cpi) res = VPX_CODEC_MEM_ERROR; - else - ctx->priv->alg_priv->cpi = optr; } } @@ -726,7 +693,7 @@ static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) free(ctx->cx_data); vp8_remove_compressor(&ctx->cpi); - free(ctx); + vpx_free(ctx); return VPX_CODEC_OK; } @@ -1278,6 +1245,9 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = 320, /* g_width */ 240, /* g_height */ + VPX_BITS_8, /* g_bit_depth */ + 8, /* g_input_bit_depth */ + {1, 30}, /* g_timebase */ 0, /* g_error_resilient */ @@ -1346,10 +1316,10 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = vp8e_destroy, /* vpx_codec_destroy_fn_t destroy; */ vp8e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */ { - NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */ - NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */ - NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */ - NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */ + NULL, /* vpx_codec_peek_si_fn_t peek_si; */ + NULL, /* vpx_codec_get_si_fn_t get_si; */ + NULL, /* vpx_codec_decode_fn_t decode; */ + NULL, /* vpx_codec_frame_get_fn_t frame_get; */ }, { 1, /* 1 cfg map */ @@ -1357,7 +1327,7 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = vp8e_encode, /* vpx_codec_encode_fn_t encode; */ vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */ vp8e_set_config, - 
NOT_IMPLEMENTED, + NULL, vp8e_get_preview, vp8e_mr_alloc_mem, } /* encoder functions */ diff --git a/source/libvpx/vp8/vp8_dx_iface.c b/source/libvpx/vp8/vp8_dx_iface.c index 0deda50..3ab8ed0 100644 --- a/source/libvpx/vp8/vp8_dx_iface.c +++ b/source/libvpx/vp8/vp8_dx_iface.c @@ -80,29 +80,30 @@ static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_ static void vp8_init_ctx(vpx_codec_ctx_t *ctx) { - ctx->priv = - (vpx_codec_priv_t *)vpx_memalign(8, sizeof(vpx_codec_alg_priv_t)); - vpx_memset(ctx->priv, 0, sizeof(vpx_codec_alg_priv_t)); - ctx->priv->sz = sizeof(*ctx->priv); - ctx->priv->alg_priv = (vpx_codec_alg_priv_t *)ctx->priv; - ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si); - ctx->priv->alg_priv->decrypt_cb = NULL; - ctx->priv->alg_priv->decrypt_state = NULL; - ctx->priv->alg_priv->flushed = 0; + vpx_codec_alg_priv_t *priv = + (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv)); + + ctx->priv = (vpx_codec_priv_t *)priv; ctx->priv->init_flags = ctx->init_flags; + priv->si.sz = sizeof(priv->si); + priv->decrypt_cb = NULL; + priv->decrypt_state = NULL; + priv->flushed = 0; + if (ctx->config.dec) { /* Update the reference to the config structure to an internal copy. */ - ctx->priv->alg_priv->cfg = *ctx->config.dec; - ctx->config.dec = &ctx->priv->alg_priv->cfg; + priv->cfg = *ctx->config.dec; + ctx->config.dec = &priv->cfg; } } static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, vpx_codec_priv_enc_mr_cfg_t *data) { - vpx_codec_err_t res = VPX_CODEC_OK; + vpx_codec_err_t res = VPX_CODEC_OK; + vpx_codec_alg_priv_t *priv = NULL; (void) data; vp8_rtcd(); @@ -114,29 +115,30 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, if (!ctx->priv) { vp8_init_ctx(ctx); + priv = (vpx_codec_alg_priv_t *)ctx->priv; /* initialize number of fragments to zero */ - ctx->priv->alg_priv->fragments.count = 0; + priv->fragments.count = 0; /* is input fragments enabled? 
*/ - ctx->priv->alg_priv->fragments.enabled = - (ctx->priv->alg_priv->base.init_flags & - VPX_CODEC_USE_INPUT_FRAGMENTS); + priv->fragments.enabled = + (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS); /*post processing level initialized to do nothing */ } + else + { + priv = (vpx_codec_alg_priv_t *)ctx->priv; + } - ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads = - (ctx->priv->alg_priv->base.init_flags & - VPX_CODEC_USE_FRAME_THREADING); + priv->yv12_frame_buffers.use_frame_threads = + (ctx->priv->init_flags & VPX_CODEC_USE_FRAME_THREADING); /* for now, disable frame threading */ - ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads = 0; + priv->yv12_frame_buffers.use_frame_threads = 0; - if(ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads && - (( ctx->priv->alg_priv->base.init_flags & - VPX_CODEC_USE_ERROR_CONCEALMENT) - || ( ctx->priv->alg_priv->base.init_flags & - VPX_CODEC_USE_INPUT_FRAGMENTS) ) ) + if (priv->yv12_frame_buffers.use_frame_threads && + ((ctx->priv->init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT) || + (ctx->priv->init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS))) { /* row-based threading, error concealment, and input fragments will * not be supported when using frame-based threading */ @@ -814,15 +816,15 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) = vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */ vp8_decode, /* vpx_codec_decode_fn_t decode; */ vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */ - NOT_IMPLEMENTED, + NULL, }, { /* encoder functions */ 0, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED + NULL, + NULL, + NULL, + NULL, + NULL, + NULL } }; diff --git a/source/libvpx/vp8/vp8cx_arm.mk b/source/libvpx/vp8/vp8cx_arm.mk index 0b3eed0..551271e 100644 --- a/source/libvpx/vp8/vp8cx_arm.mk +++ b/source/libvpx/vp8/vp8cx_arm.mk @@ -36,11 +36,9 @@ VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM) #File list for neon # encoder 
VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/fastquantizeb_neon$(ASM) -VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/picklpf_arm.c -VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/shortfdct_neon$(ASM) VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_mse16x16_neon$(ASM) -VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_memcpy_neon$(ASM) VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c +VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c diff --git a/source/libvpx/vp9/common/vp9_alloccommon.c b/source/libvpx/vp9/common/vp9_alloccommon.c index c65e008..21ae8d5 100644 --- a/source/libvpx/vp9/common/vp9_alloccommon.c +++ b/source/libvpx/vp9/common/vp9_alloccommon.c @@ -177,7 +177,11 @@ int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) { for (i = 0; i < FRAME_BUFFERS; ++i) { cm->frame_bufs[i].ref_count = 0; if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height, - ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0) + ss_x, ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS) < 0) goto fail; } @@ -185,6 +189,9 @@ int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) { #if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif VP9_ENC_BORDER_IN_PIXELS) < 0) goto fail; #endif diff --git a/source/libvpx/vp9/common/vp9_common.h b/source/libvpx/vp9/common/vp9_common.h index 2788e66..5587192 100644 --- a/source/libvpx/vp9/common/vp9_common.h +++ b/source/libvpx/vp9/common/vp9_common.h @@ -64,6 +64,11 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { return num_values > 0 ? 
get_msb(num_values) + 1 : 0; } +#if CONFIG_VP9_HIGHBITDEPTH +#define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1)) +#define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1 )) +#endif // CONFIG_VP9_HIGHBITDEPTH + #if CONFIG_DEBUG #define CHECK_MEM_ERROR(cm, lval, expr) do { \ lval = (expr); \ diff --git a/source/libvpx/vp9/common/vp9_enums.h b/source/libvpx/vp9/common/vp9_enums.h index d776313..8817fdb 100644 --- a/source/libvpx/vp9/common/vp9_enums.h +++ b/source/libvpx/vp9/common/vp9_enums.h @@ -40,12 +40,6 @@ typedef enum BITSTREAM_PROFILE { MAX_PROFILES } BITSTREAM_PROFILE; -typedef enum BIT_DEPTH { - BITS_8, - BITS_10, - BITS_12 -} BIT_DEPTH; - typedef enum BLOCK_SIZE { BLOCK_4X4, BLOCK_4X8, diff --git a/source/libvpx/vp9/common/vp9_mv.h b/source/libvpx/vp9/common/vp9_mv.h index 3eb7f9d..5d89da8 100644 --- a/source/libvpx/vp9/common/vp9_mv.h +++ b/source/libvpx/vp9/common/vp9_mv.h @@ -34,6 +34,14 @@ typedef struct mv32 { int32_t col; } MV32; +static INLINE int is_zero_mv(const MV *mv) { + return *((const uint32_t *)mv) == 0; +} + +static INLINE int is_equal_mv(const MV *a, const MV *b) { + return *((const uint32_t *)a) == *((const uint32_t *)b); +} + static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row, int max_row) { mv->col = clamp(mv->col, min_col, max_col); diff --git a/source/libvpx/vp9/common/vp9_onyxc_int.h b/source/libvpx/vp9/common/vp9_onyxc_int.h index 47aa563..637867a 100644 --- a/source/libvpx/vp9/common/vp9_onyxc_int.h +++ b/source/libvpx/vp9/common/vp9_onyxc_int.h @@ -84,6 +84,10 @@ typedef struct VP9Common { int subsampling_x; int subsampling_y; +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth; // Marks if we need to use 16bit frame buffers. 
+#endif + YV12_BUFFER_CONFIG *frame_to_show; RefCntBuffer frame_bufs[FRAME_BUFFERS]; @@ -179,8 +183,8 @@ typedef struct VP9Common { unsigned int current_video_frame; BITSTREAM_PROFILE profile; - // BITS_8 in versions 0 and 1, BITS_10 or BITS_12 in version 2 - BIT_DEPTH bit_depth; + // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3. + vpx_bit_depth_t bit_depth; #if CONFIG_VP9_POSTPROC struct postproc_state postproc_state; diff --git a/source/libvpx/vp9/common/vp9_postproc.c b/source/libvpx/vp9/common/vp9_postproc.c index abda4e6..e4e6ce7 100644 --- a/source/libvpx/vp9/common/vp9_postproc.c +++ b/source/libvpx/vp9/common/vp9_postproc.c @@ -366,6 +366,9 @@ void vp9_plane_add_noise_c(uint8_t *start, char *noise, unsigned int width, unsigned int height, int pitch) { unsigned int i, j; + // TODO(jbb): why does simd code use both but c doesn't, normalize and + // fix.. + (void) bothclamp; for (i = 0; i < height; i++) { uint8_t *pos = start + i * pitch; char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT diff --git a/source/libvpx/vp9/common/vp9_reconintra.c b/source/libvpx/vp9/common/vp9_reconintra.c index 403e105..471929a 100644 --- a/source/libvpx/vp9/common/vp9_reconintra.c +++ b/source/libvpx/vp9/common/vp9_reconintra.c @@ -9,11 +9,9 @@ */ #include "./vpx_config.h" +#include "./vp9_rtcd.h" #include "vpx_mem/vpx_mem.h" -#include "vpx_ports/vpx_once.h" - -#include "./vp9_rtcd.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_onyxc_int.h" @@ -292,32 +290,32 @@ intra_pred_allsizes(dc) typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -static intra_pred_fn pred[INTRA_MODES][4]; -static intra_pred_fn dc_pred[2][2][4]; - -static void init_intra_pred_fn_ptrs(void) { -#define intra_pred_allsizes(l, type) \ - l[0] = vp9_##type##_predictor_4x4; \ - l[1] = vp9_##type##_predictor_8x8; \ - l[2] = vp9_##type##_predictor_16x16; \ - l[3] = vp9_##type##_predictor_32x32 - - 
intra_pred_allsizes(pred[V_PRED], v); - intra_pred_allsizes(pred[H_PRED], h); - intra_pred_allsizes(pred[D207_PRED], d207); - intra_pred_allsizes(pred[D45_PRED], d45); - intra_pred_allsizes(pred[D63_PRED], d63); - intra_pred_allsizes(pred[D117_PRED], d117); - intra_pred_allsizes(pred[D135_PRED], d135); - intra_pred_allsizes(pred[D153_PRED], d153); - intra_pred_allsizes(pred[TM_PRED], tm); - - intra_pred_allsizes(dc_pred[0][0], dc_128); - intra_pred_allsizes(dc_pred[0][1], dc_top); - intra_pred_allsizes(dc_pred[1][0], dc_left); - intra_pred_allsizes(dc_pred[1][1], dc); - -#undef intra_pred_allsizes +static intra_pred_fn pred[INTRA_MODES][TX_SIZES]; +static intra_pred_fn dc_pred[2][2][TX_SIZES]; + +void vp9_init_intra_predictors() { +#define INIT_ALL_SIZES(p, type) \ + p[TX_4X4] = vp9_##type##_predictor_4x4; \ + p[TX_8X8] = vp9_##type##_predictor_8x8; \ + p[TX_16X16] = vp9_##type##_predictor_16x16; \ + p[TX_32X32] = vp9_##type##_predictor_32x32 + + INIT_ALL_SIZES(pred[V_PRED], v); + INIT_ALL_SIZES(pred[H_PRED], h); + INIT_ALL_SIZES(pred[D207_PRED], d207); + INIT_ALL_SIZES(pred[D45_PRED], d45); + INIT_ALL_SIZES(pred[D63_PRED], d63); + INIT_ALL_SIZES(pred[D117_PRED], d117); + INIT_ALL_SIZES(pred[D135_PRED], d135); + INIT_ALL_SIZES(pred[D153_PRED], d153); + INIT_ALL_SIZES(pred[TM_PRED], tm); + + INIT_ALL_SIZES(dc_pred[0][0], dc_128); + INIT_ALL_SIZES(dc_pred[0][1], dc_top); + INIT_ALL_SIZES(dc_pred[1][0], dc_left); + INIT_ALL_SIZES(dc_pred[1][1], dc); + +#undef INIT_ALL_SIZES } static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, @@ -343,8 +341,6 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, // 129 G H .. S T T T T T // .. - once(init_intra_pred_fn_ptrs); - // Get current frame pointer, width and height. 
if (plane == 0) { frame_width = xd->cur_buf->y_width; diff --git a/source/libvpx/vp9/common/vp9_reconintra.h b/source/libvpx/vp9/common/vp9_reconintra.h index d09d2a1..845f3bc 100644 --- a/source/libvpx/vp9/common/vp9_reconintra.h +++ b/source/libvpx/vp9/common/vp9_reconintra.h @@ -18,6 +18,8 @@ extern "C" { #endif +void vp9_init_intra_predictors(); + void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, TX_SIZE tx_size, PREDICTION_MODE mode, const uint8_t *ref, int ref_stride, diff --git a/source/libvpx/vp9/common/vp9_rtcd_defs.pl b/source/libvpx/vp9/common/vp9_rtcd_defs.pl index 92f9318..667e057 100644 --- a/source/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/source/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -268,7 +268,7 @@ $vp9_lpf_horizontal_4_dual_neon_asm=vp9_lpf_horizontal_4_dual_neon; # if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") { add_proto qw/void vp9_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit"; -specialize qw/vp9_mbpost_proc_down mmx sse2/; +specialize qw/vp9_mbpost_proc_down sse2/; $vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm; add_proto qw/void vp9_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit"; @@ -276,23 +276,14 @@ specialize qw/vp9_mbpost_proc_across_ip sse2/; $vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm; add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"; -specialize qw/vp9_post_proc_down_and_across mmx sse2/; +specialize qw/vp9_post_proc_down_and_across sse2/; $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm; add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"; -specialize qw/vp9_plane_add_noise mmx sse2/; +specialize qw/vp9_plane_add_noise sse2/; 
$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt; } -add_proto qw/void vp9_blend_mb_inner/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"; -specialize qw/vp9_blend_mb_inner/; - -add_proto qw/void vp9_blend_mb_outer/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"; -specialize qw/vp9_blend_mb_outer/; - -add_proto qw/void vp9_blend_b/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"; -specialize qw/vp9_blend_b/; - # # Sub Pixel Filters # @@ -420,19 +411,19 @@ add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int sourc specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc"; +specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc"; +specialize qw/vp9_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc"; +specialize qw/vp9_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x8 mmx neon/, "$sse2_x86inc"; +specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc"; add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get8x8var mmx neon/, "$sse2_x86inc"; +specialize qw/vp9_get8x8var neon/, 
"$sse2_x86inc"; add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc"; @@ -444,7 +435,7 @@ add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_ specialize qw/vp9_variance4x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc"; +specialize qw/vp9_variance4x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -545,16 +536,16 @@ add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_str specialize qw/vp9_sad32x32 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad16x16 mmx neon/, "$sse2_x86inc"; +specialize qw/vp9_sad16x16 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad16x8 mmx/, "$sse2_x86inc"; +specialize qw/vp9_sad16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad8x16 mmx/, "$sse2_x86inc"; +specialize qw/vp9_sad8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad8x8 mmx neon/, "$sse2_x86inc"; +specialize qw/vp9_sad8x8 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad8x4/, "$sse2_x86inc"; @@ -563,7 +554,7 @@ add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_strid specialize qw/vp9_sad4x8/, "$sse_x86inc"; add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad4x4 mmx/, "$sse_x86inc"; +specialize qw/vp9_sad4x4/, "$sse_x86inc"; add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad64x64_avg/, "$sse2_x86inc"; @@ -693,19 +684,19 @@ add_proto qw/void vp9_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, cons specialize qw/vp9_sad4x4x4d sse/; add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse16x16 mmx avx2/, "$sse2_x86inc"; +specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse8x16/; +specialize qw/vp9_mse8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse16x8/; +specialize qw/vp9_mse16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse8x8/; +specialize qw/vp9_mse8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *"; -specialize qw/vp9_get_mb_ss mmx sse2/; +specialize qw/vp9_get_mb_ss/, "$sse2_x86inc"; # ENCODEMB INVOKE add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, 
int64_t *ssz"; diff --git a/source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm b/source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm deleted file mode 100644 index 5b8deef..0000000 --- a/source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm +++ /dev/null @@ -1,533 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define VP9_FILTER_WEIGHT 128 -%define VP9_FILTER_SHIFT 7 - -;void vp9_post_proc_down_and_across_mmx -;( -; unsigned char *src_ptr, -; unsigned char *dst_ptr, -; int src_pixels_per_line, -; int dst_pixels_per_line, -; int rows, -; int cols, -; int flimit -;) -global sym(vp9_post_proc_down_and_across_mmx) PRIVATE -sym(vp9_post_proc_down_and_across_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -%if ABI_IS_32BIT=1 && CONFIG_PIC=1 - ; move the global rd onto the stack, since we don't have enough registers - ; to do PIC addressing - movq mm0, [GLOBAL(rd)] - sub rsp, 8 - movq [rsp], mm0 -%define RD [rsp] -%else -%define RD [GLOBAL(rd)] -%endif - - push rbx - lea rbx, [GLOBAL(Blur)] - movd mm2, dword ptr arg(6) ;flimit - punpcklwd mm2, mm2 - punpckldq mm2, mm2 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;dst_ptr - - movsxd rcx, DWORD PTR arg(4) ;rows - movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? 
- pxor mm0, mm0 ; mm0 = 00000000 - -.nextrow: - - xor rdx, rdx ; clear out rdx for use as loop counter -.nextcol: - - pxor mm7, mm7 ; mm7 = 00000000 - movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps - movq mm3, [rsi] ; mm4 = r0 p0..p7 - punpcklbw mm3, mm0 ; mm3 = p0..p3 - movq mm1, mm3 ; mm1 = p0..p3 - pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers - - movq mm6, [rbx + 48] ; mm6 = kernel 3 taps - movq mm5, [rsi + rax] ; mm4 = r1 p0..p7 - punpcklbw mm5, mm0 ; mm5 = r1 p0..p3 - pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers - paddusw mm3, mm6 ; mm3 += mm6 - - ; thresholding - movq mm7, mm1 ; mm7 = r0 p0..p3 - psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3 - psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3 - paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3) - pcmpgtw mm7, mm2 - - movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers - movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7 - punpcklbw mm5, mm0 ; mm5 = r2 p0..p3 - pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = r0 p0..p3 - psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3 - psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r2 p0..p3 - paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - - neg rax - movq mm6, [rbx ] ; kernel 0 taps - movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7 - punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3 - pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = r0 p0..p3 - psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3 - psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3 - paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - movq mm6, [rbx + 16] ; kernel 1 taps - movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7 - punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3 - pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. 
- paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = r0 p0..p3 - psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3 - psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3 - paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - - paddusw mm3, RD ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - - pand mm1, mm7 ; mm1 select vals > thresh from source - pandn mm7, mm3 ; mm7 select vals < thresh from blurred result - paddusw mm1, mm7 ; combination - - packuswb mm1, mm0 ; pack to bytes - - movd [rdi], mm1 ; - neg rax ; pitch is positive - - - add rsi, 4 - add rdi, 4 - add rdx, 4 - - cmp edx, dword ptr arg(5) ;cols - jl .nextcol - ; done with the all cols, start the across filtering in place - sub rsi, rdx - sub rdi, rdx - - - push rax - xor rdx, rdx - mov rax, [rdi-4]; - -.acrossnextcol: - pxor mm7, mm7 ; mm7 = 00000000 - movq mm6, [rbx + 32 ] ; - movq mm4, [rdi+rdx] ; mm4 = p0..p7 - movq mm3, mm4 ; mm3 = p0..p7 - punpcklbw mm3, mm0 ; mm3 = p0..p3 - movq mm1, mm3 ; mm1 = p0..p3 - pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers - - movq mm6, [rbx + 48] - psrlq mm4, 8 ; mm4 = p1..p7 - movq mm5, mm4 ; mm5 = p1..p7 - punpcklbw mm5, mm0 ; mm5 = p1..p4 - pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers - paddusw mm3, mm6 ; mm3 += mm6 - - ; thresholding - movq mm7, mm1 ; mm7 = p0..p3 - psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4) - pcmpgtw mm7, mm2 - - movq mm6, [rbx + 64 ] - psrlq mm4, 8 ; mm4 = p2..p7 - movq mm5, mm4 ; mm5 = p2..p7 - punpcklbw mm5, mm0 ; mm5 = p2..p5 - pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = p0..p3 - psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - - movq mm6, [rbx ] - 
movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5 - movq mm5, mm4 ; mm5 = p-2..p5 - punpcklbw mm5, mm0 ; mm5 = p-2..p1 - pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = p0..p3 - psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - movq mm6, [rbx + 16] - psrlq mm4, 8 ; mm4 = p-1..p5 - punpcklbw mm4, mm0 ; mm4 = p-1..p2 - pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = p0..p3 - psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4 - psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - paddusw mm3, RD ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - - pand mm1, mm7 ; mm1 select vals > thresh from source - pandn mm7, mm3 ; mm7 select vals < thresh from blurred result - paddusw mm1, mm7 ; combination - - packuswb mm1, mm0 ; pack to bytes - mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes - movd eax, mm1 - - add rdx, 4 - cmp edx, dword ptr arg(5) ;cols - jl .acrossnextcol; - - mov DWORD PTR [rdi+rdx-4], eax - pop rax - - ; done with this rwo - add rsi,rax ; next line - movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch? - add rdi,rax ; next destination - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch? 
- - dec rcx ; decrement count - jnz .nextrow ; next row - pop rbx - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret -%undef RD - - -;void vp9_mbpost_proc_down_mmx(unsigned char *dst, -; int pitch, int rows, int cols,int flimit) -extern sym(vp9_rv) -global sym(vp9_mbpost_proc_down_mmx) PRIVATE -sym(vp9_mbpost_proc_down_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 136 - - ; unsigned char d[16][8] at [rsp] - ; create flimit2 at [rsp+128] - mov eax, dword ptr arg(4) ;flimit - mov [rsp+128], eax - mov [rsp+128+4], eax -%define flimit2 [rsp+128] - -%if ABI_IS_32BIT=0 - lea r8, [GLOBAL(sym(vp9_rv))] -%endif - - ;rows +=8; - add dword ptr arg(2), 8 - - ;for(c=0; c<cols; c+=4) -.loop_col: - mov rsi, arg(0) ;s - pxor mm0, mm0 ; - - movsxd rax, dword ptr arg(1) ;pitch ; - neg rax ; rax = -pitch - - lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8] - neg rax - - - pxor mm5, mm5 - pxor mm6, mm6 ; - - pxor mm7, mm7 ; - mov rdi, rsi - - mov rcx, 15 ; - -.loop_initvar: - movd mm1, DWORD PTR [rdi]; - punpcklbw mm1, mm0 ; - - paddw mm5, mm1 ; - pmullw mm1, mm1 ; - - movq mm2, mm1 ; - punpcklwd mm1, mm0 ; - - punpckhwd mm2, mm0 ; - paddd mm6, mm1 ; - - paddd mm7, mm2 ; - lea rdi, [rdi+rax] ; - - dec rcx - jne .loop_initvar - ;save the var and sum - xor rdx, rdx -.loop_row: - movd mm1, DWORD PTR [rsi] ; [s-pitch*8] - movd mm2, DWORD PTR [rdi] ; [s+pitch*7] - - punpcklbw mm1, mm0 - punpcklbw mm2, mm0 - - paddw mm5, mm2 - psubw mm5, mm1 - - pmullw mm2, mm2 - movq mm4, mm2 - - punpcklwd mm2, mm0 - punpckhwd mm4, mm0 - - paddd mm6, mm2 - paddd mm7, mm4 - - pmullw mm1, mm1 - movq mm2, mm1 - - punpcklwd mm1, mm0 - psubd mm6, mm1 - - punpckhwd mm2, mm0 - psubd mm7, mm2 - - - movq mm3, mm6 - pslld mm3, 4 - - psubd mm3, mm6 - movq mm1, mm5 - - movq mm4, mm5 - pmullw mm1, mm1 - - pmulhw mm4, mm4 - movq mm2, mm1 - - punpcklwd mm1, mm4 - punpckhwd mm2, mm4 - - movq mm4, 
mm7 - pslld mm4, 4 - - psubd mm4, mm7 - - psubd mm3, mm1 - psubd mm4, mm2 - - psubd mm3, flimit2 - psubd mm4, flimit2 - - psrad mm3, 31 - psrad mm4, 31 - - packssdw mm3, mm4 - packsswb mm3, mm0 - - movd mm1, DWORD PTR [rsi+rax*8] - - movq mm2, mm1 - punpcklbw mm1, mm0 - - paddw mm1, mm5 - mov rcx, rdx - - and rcx, 127 -%if ABI_IS_32BIT=1 && CONFIG_PIC=1 - push rax - lea rax, [GLOBAL(sym(vp9_rv))] - movq mm4, [rax + rcx*2] ;vp9_rv[rcx*2] - pop rax -%elif ABI_IS_32BIT=0 - movq mm4, [r8 + rcx*2] ;vp9_rv[rcx*2] -%else - movq mm4, [sym(vp9_rv) + rcx*2] -%endif - paddw mm1, mm4 - ;paddw xmm1, eight8s - psraw mm1, 4 - - packuswb mm1, mm0 - pand mm1, mm3 - - pandn mm3, mm2 - por mm1, mm3 - - and rcx, 15 - movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4] - - mov rcx, rdx - sub rcx, 8 - - and rcx, 15 - movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4] - - movd [rsi], mm1 - lea rsi, [rsi+rax] - - lea rdi, [rdi+rax] - add rdx, 1 - - cmp edx, dword arg(2) ;rows - jl .loop_row - - - add dword arg(0), 4 ; s += 4 - sub dword arg(3), 4 ; cols -= 4 - cmp dword arg(3), 0 - jg .loop_col - - add rsp, 136 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret -%undef flimit2 - - -;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise, -; unsigned char blackclamp[16], -; unsigned char whiteclamp[16], -; unsigned char bothclamp[16], -; unsigned int width, unsigned int height, int pitch) -global sym(vp9_plane_add_noise_mmx) PRIVATE -sym(vp9_plane_add_noise_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -.addnoise_loop: - call sym(LIBVPX_RAND) WRT_PLT - mov rcx, arg(1) ;noise - and rax, 0xff - add rcx, rax - - ; we rely on the fact that the clamping vectors are stored contiguously - ; in black/white/both order. 
Note that we have to reload this here because - ; rdx could be trashed by rand() - mov rdx, arg(2) ; blackclamp - - - mov rdi, rcx - movsxd rcx, dword arg(5) ;[Width] - mov rsi, arg(0) ;Pos - xor rax,rax - -.addnoise_nextset: - movq mm1,[rsi+rax] ; get the source - - psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise - paddusb mm1, [rdx+32] ;bothclamp - psubusb mm1, [rdx+16] ;whiteclamp - - movq mm2,[rdi+rax] ; get the noise for this line - paddb mm1,mm2 ; add it in - movq [rsi+rax],mm1 ; store the result - - add rax,8 ; move to the next line - - cmp rax, rcx - jl .addnoise_nextset - - movsxd rax, dword arg(7) ; Pitch - add arg(0), rax ; Start += Pitch - sub dword arg(6), 1 ; Height -= 1 - jg .addnoise_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -Blur: - times 16 dw 16 - times 8 dw 64 - times 16 dw 16 - times 8 dw 0 - -rd: - times 4 dw 0x40 diff --git a/source/libvpx/vp9/decoder/vp9_decodeframe.c b/source/libvpx/vp9/decoder/vp9_decodeframe.c index a0fff45..a9c03f0 100644 --- a/source/libvpx/vp9/decoder/vp9_decodeframe.c +++ b/source/libvpx/vp9/decoder/vp9_decodeframe.c @@ -330,6 +330,9 @@ static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd, if (!vp9_is_valid_scale(&ref_buffer->sf)) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid scale factors"); + if (ref_buffer->buf->corrupted) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Block reference is corrupt"); vp9_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col, &ref_buffer->sf); xd->corrupted |= ref_buffer->buf->corrupted; @@ -627,11 +630,14 @@ static void resize_context_buffers(VP9_COMMON *cm, int width, int height) { "Width and height beyond allowed size."); #endif if (cm->width != width || cm->height != height) { - const int new_rows = ALIGN_POWER_OF_TWO(height, - MI_SIZE_LOG2) >> MI_SIZE_LOG2; - const int new_cols = ALIGN_POWER_OF_TWO(width, - 
MI_SIZE_LOG2) >> MI_SIZE_LOG2; - if (calc_mi_size(new_rows) * calc_mi_size(new_cols) > cm->mi_alloc_size) { + const int new_mi_rows = + ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2; + const int new_mi_cols = + ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2; + + // Allocations in vp9_alloc_context_buffers() depend on individual + // dimensions as well as the overall size. + if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) { if (vp9_alloc_context_buffers(cm, width, height)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate context buffers"); @@ -652,7 +658,11 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { if (vp9_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_DEC_BORDER_IN_PIXELS, &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb, cm->cb_priv)) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, @@ -670,6 +680,10 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf; width = buf->y_crop_width; height = buf->y_crop_height; + if (buf->corrupted) { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Frame reference is corrupt"); + } found = 1; break; } @@ -699,7 +713,11 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, if (vp9_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_DEC_BORDER_IN_PIXELS, &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb, cm->cb_priv)) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, @@ -812,6 +830,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, if 
(cm->lf.filter_level) { LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + // Be sure to sync as we might be resuming after a failed frame decode. + winterface->sync(&pbi->lf_worker); lf_data->frame_buffer = get_frame_new_buffer(cm); lf_data->cm = cm; vp9_copy(lf_data->planes, pbi->mb.plane); @@ -881,7 +901,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, pbi->mb.corrupted |= tile_data->xd.corrupted; } // Loopfilter one row. - if (cm->lf.filter_level) { + if (cm->lf.filter_level && !pbi->mb.corrupted) { const int lf_start = mi_row - MI_BLOCK_SIZE; LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; @@ -904,7 +924,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, } // Loopfilter remaining rows in the frame. - if (cm->lf.filter_level) { + if (cm->lf.filter_level && !pbi->mb.corrupted) { LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; winterface->sync(&pbi->lf_worker); lf_data->start = lf_data->stop; @@ -993,6 +1013,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, // Reset tile decoding hook for (n = 0; n < num_workers; ++n) { + winterface->sync(&pbi->tile_workers[n]); pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook; } @@ -1096,7 +1117,7 @@ BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb) { static void read_bitdepth_colorspace_sampling( VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { if (cm->profile >= PROFILE_2) - cm->bit_depth = vp9_rb_read_bit(rb) ? BITS_12 : BITS_10; + cm->bit_depth = vp9_rb_read_bit(rb) ? 
VPX_BITS_12 : VPX_BITS_10; cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3); if (cm->color_space != SRGB) { vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range @@ -1140,6 +1161,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, "Invalid frame marker"); cm->profile = vp9_read_profile(rb); + if (cm->profile >= MAX_PROFILES) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Unsupported bitstream profile"); @@ -1398,7 +1420,7 @@ void vp9_decode_frame(VP9Decoder *pbi, if (!first_partition_size) { // showing a frame directly - *p_data_end = data + 1; + *p_data_end = data + (cm->profile <= PROFILE_2 ? 1 : 2); return; } @@ -1429,9 +1451,11 @@ void vp9_decode_frame(VP9Decoder *pbi, if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 && cm->frame_parallel_decoding_mode) { *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); - // If multiple threads are used to decode tiles, then we use those threads - // to do parallel loopfiltering. - vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0); + if (!xd->corrupted) { + // If multiple threads are used to decode tiles, then we use those threads + // to do parallel loopfiltering. 
+ vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0); + } } else { *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end); } diff --git a/source/libvpx/vp9/decoder/vp9_decoder.c b/source/libvpx/vp9/decoder/vp9_decoder.c index e79dcf3..9106b0d 100644 --- a/source/libvpx/vp9/decoder/vp9_decoder.c +++ b/source/libvpx/vp9/decoder/vp9_decoder.c @@ -25,6 +25,7 @@ #include "vp9/common/vp9_postproc.h" #endif #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_systemdependent.h" #include "vp9/decoder/vp9_decodeframe.h" @@ -36,7 +37,9 @@ static void initialize_dec() { static int init_done = 0; if (!init_done) { + vp9_rtcd(); vp9_init_neighbors(); + vp9_init_intra_predictors(); init_done = 1; } } @@ -59,13 +62,12 @@ VP9Decoder *vp9_decoder_create() { cm->error.setjmp = 1; initialize_dec(); - vp9_rtcd(); - // Initialize the references to not point to any frame buffers. vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); cm->current_video_frame = 0; pbi->ready_for_new_data = 1; + cm->bit_depth = VPX_BITS_8; // vp9_init_dequantizer() is first called here. Add check in // frame_init_dequantizer() to avoid unnecessary calling of @@ -96,10 +98,8 @@ void vp9_decoder_remove(VP9Decoder *pbi) { } vpx_free(pbi->tile_workers); - if (pbi->num_tile_workers) { - const int sb_rows = - mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; - vp9_loop_filter_dealloc(&pbi->lf_row_sync, sb_rows); + if (pbi->num_tile_workers > 0) { + vp9_loop_filter_dealloc(&pbi->lf_row_sync); } vp9_remove_common(cm); diff --git a/source/libvpx/vp9/decoder/vp9_dthread.c b/source/libvpx/vp9/decoder/vp9_dthread.c index 5dda49a..b82ea6a 100644 --- a/source/libvpx/vp9/decoder/vp9_dthread.c +++ b/source/libvpx/vp9/decoder/vp9_dthread.c @@ -147,17 +147,8 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, // Allocate memory used in thread synchronization. 
// This always needs to be done even if frame_filter_level is 0. - if (!cm->current_video_frame || cm->last_height != cm->height) { - if (cm->last_height != cm->height) { - const int aligned_last_height = - ALIGN_POWER_OF_TWO(cm->last_height, MI_SIZE_LOG2); - const int last_sb_rows = - mi_cols_aligned_to_sb(aligned_last_height >> MI_SIZE_LOG2) >> - MI_BLOCK_SIZE_LOG2; - - vp9_loop_filter_dealloc(lf_sync, last_sb_rows); - } - + if (!lf_sync->sync_range || cm->last_height != cm->height) { + vp9_loop_filter_dealloc(lf_sync); vp9_loop_filter_alloc(cm, lf_sync, sb_rows, cm->width); } @@ -227,19 +218,22 @@ static int get_sync_range(int width) { // Allocate memory for lf row synchronization void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows, int width) { + lf_sync->rows = rows; #if CONFIG_MULTITHREAD - int i; + { + int i; - CHECK_MEM_ERROR(cm, lf_sync->mutex_, - vpx_malloc(sizeof(*lf_sync->mutex_) * rows)); - for (i = 0; i < rows; ++i) { - pthread_mutex_init(&lf_sync->mutex_[i], NULL); - } + CHECK_MEM_ERROR(cm, lf_sync->mutex_, + vpx_malloc(sizeof(*lf_sync->mutex_) * rows)); + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->mutex_[i], NULL); + } - CHECK_MEM_ERROR(cm, lf_sync->cond_, - vpx_malloc(sizeof(*lf_sync->cond_) * rows)); - for (i = 0; i < rows; ++i) { - pthread_cond_init(&lf_sync->cond_[i], NULL); + CHECK_MEM_ERROR(cm, lf_sync->cond_, + vpx_malloc(sizeof(*lf_sync->cond_) * rows)); + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->cond_[i], NULL); + } } #endif // CONFIG_MULTITHREAD @@ -251,23 +245,19 @@ void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows, } // Deallocate lf synchronization related mutex and data -void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) { -#if !CONFIG_MULTITHREAD - (void)rows; -#endif // !CONFIG_MULTITHREAD - +void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { if (lf_sync != NULL) { #if CONFIG_MULTITHREAD int i; if (lf_sync->mutex_ != NULL) { - for (i = 0; i < rows; 
++i) { + for (i = 0; i < lf_sync->rows; ++i) { pthread_mutex_destroy(&lf_sync->mutex_[i]); } vpx_free(lf_sync->mutex_); } if (lf_sync->cond_ != NULL) { - for (i = 0; i < rows; ++i) { + for (i = 0; i < lf_sync->rows; ++i) { pthread_cond_destroy(&lf_sync->cond_[i]); } vpx_free(lf_sync->cond_); diff --git a/source/libvpx/vp9/decoder/vp9_dthread.h b/source/libvpx/vp9/decoder/vp9_dthread.h index 423bd88..8b02ef7 100644 --- a/source/libvpx/vp9/decoder/vp9_dthread.h +++ b/source/libvpx/vp9/decoder/vp9_dthread.h @@ -38,6 +38,7 @@ typedef struct VP9LfSyncData { // The optimal sync_range for different resolution and platform should be // determined by testing. Currently, it is chosen to be a power-of-2 number. int sync_range; + int rows; } VP9LfSync; // Allocate memory for loopfilter row synchronization. @@ -45,7 +46,7 @@ void vp9_loop_filter_alloc(struct VP9Common *cm, VP9LfSync *lf_sync, int rows, int width); // Deallocate loopfilter synchronization related mutex and data. -void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows); +void vp9_loop_filter_dealloc(VP9LfSync *lf_sync); // Multi-threaded loopfilter that uses the tile threads. 
void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, diff --git a/source/libvpx/vp9/encoder/vp9_bitstream.c b/source/libvpx/vp9/encoder/vp9_bitstream.c index bdb1338..b605248 100644 --- a/source/libvpx/vp9/encoder/vp9_bitstream.c +++ b/source/libvpx/vp9/encoder/vp9_bitstream.c @@ -294,6 +294,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, vp9_write_token(w, vp9_switchable_interp_tree, cm->fc.switchable_interp_prob[ctx], &switchable_interp_encodings[mbmi->interp_filter]); + ++cpi->interp_filter_selected[0][mbmi->interp_filter]; } else { assert(mbmi->interp_filter == cm->interp_filter); } @@ -670,8 +671,6 @@ static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) { vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES]; vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES]; - vp9_clear_system_state(); - for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size) build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size], frame_coef_probs[tx_size]); @@ -998,8 +997,10 @@ static void write_frame_size_with_refs(VP9_COMP *cpi, // Set "found" to 0 for temporal svc and for spatial svc key frame if (cpi->use_svc && - (cpi->svc.number_spatial_layers == 1 || - cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame)) { + ((cpi->svc.number_temporal_layers > 1 && + cpi->oxcf.rc_mode == VPX_CBR) || + (cpi->svc.number_spatial_layers > 1 && + cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame))) { found = 0; } vp9_wb_write_bit(wb, found); @@ -1045,8 +1046,8 @@ static void write_profile(BITSTREAM_PROFILE profile, static void write_bitdepth_colorspace_sampling( VP9_COMMON *const cm, struct vp9_write_bit_buffer *wb) { if (cm->profile >= PROFILE_2) { - assert(cm->bit_depth > BITS_8); - vp9_wb_write_bit(wb, cm->bit_depth - BITS_10); + assert(cm->bit_depth > VPX_BITS_8); + vp9_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 
0 : 1); } vp9_wb_write_literal(wb, cm->color_space, 3); if (cm->color_space != SRGB) { @@ -1083,7 +1084,16 @@ static void write_uncompressed_header(VP9_COMP *cpi, write_bitdepth_colorspace_sampling(cm, wb); write_frame_size(cm, wb); } else { - if (!cm->show_frame) + // In spatial svc if it's not error_resilient_mode then we need to code all + // visible frames as invisible. But we need to keep the show_frame flag so + // that the publisher could know whether it is supposed to be visible. + // So we will code the show_frame flag as it is. Then code the intra_only + // bit here. This will make the bitstream incompatible. In the player we + // will change to show_frame flag to 0, then add an one byte frame with + // show_existing_frame flag which tells the decoder which frame we want to + // show. + if (!cm->show_frame || + (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0)) vp9_wb_write_bit(wb, cm->intra_only); if (!cm->error_resilient_mode) diff --git a/source/libvpx/vp9/encoder/vp9_bitstream.h b/source/libvpx/vp9/encoder/vp9_bitstream.h index 8e82d1c..b488261 100644 --- a/source/libvpx/vp9/encoder/vp9_bitstream.h +++ b/source/libvpx/vp9/encoder/vp9_bitstream.h @@ -26,7 +26,7 @@ static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) { return !cpi->multi_arf_allowed && cpi->refresh_golden_frame && cpi->rc.is_src_frame_alt_ref && (!cpi->use_svc || // Add spatial svc base layer case here - (is_spatial_svc(cpi) && + (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id == 0 && cpi->svc.layer_context[0].gold_ref_idx >=0 && cpi->oxcf.ss_play_alternate[0])); diff --git a/source/libvpx/vp9/encoder/vp9_block.h b/source/libvpx/vp9/encoder/vp9_block.h index bd3b0fd..b726383 100644 --- a/source/libvpx/vp9/encoder/vp9_block.h +++ b/source/libvpx/vp9/encoder/vp9_block.h @@ -76,16 +76,12 @@ struct macroblock { int pred_mv_sad[MAX_REF_FRAMES]; int nmvjointcost[MV_JOINTS]; - int nmvcosts[2][MV_VALS]; int *nmvcost[2]; - int nmvcosts_hp[2][MV_VALS]; int *nmvcost_hp[2]; int 
**mvcost; int nmvjointsadcost[MV_JOINTS]; - int nmvsadcosts[2][MV_VALS]; int *nmvsadcost[2]; - int nmvsadcosts_hp[2][MV_VALS]; int *nmvsadcost_hp[2]; int **mvsadcost; @@ -116,9 +112,9 @@ struct macroblock { int quant_fp; // skip forward transform and quantization - int skip_txfm[MAX_MB_PLANE]; + uint8_t skip_txfm[MAX_MB_PLANE << 2]; - int64_t bsse[MAX_MB_PLANE]; + int64_t bsse[MAX_MB_PLANE << 2]; // Used to store sub partition's choices. MV pred_mv[MAX_REF_FRAMES]; diff --git a/source/libvpx/vp9/encoder/vp9_context_tree.h b/source/libvpx/vp9/encoder/vp9_context_tree.h index d60e6c3..236389b 100644 --- a/source/libvpx/vp9/encoder/vp9_context_tree.h +++ b/source/libvpx/vp9/encoder/vp9_context_tree.h @@ -33,7 +33,10 @@ typedef struct { int is_coded; int num_4x4_blk; int skip; - int skip_txfm[MAX_MB_PLANE]; + // For current partition, only if all Y, U, and V transform blocks' + // coefficients are quantized to 0, skippable is set to 0. + int skippable; + uint8_t skip_txfm[MAX_MB_PLANE << 2]; int best_mode_index; int hybrid_pred_diff; int comp_pred_diff; diff --git a/source/libvpx/vp9/encoder/vp9_denoiser.c b/source/libvpx/vp9/encoder/vp9_denoiser.c index 90ea9cc..c4cf5ee 100644 --- a/source/libvpx/vp9/encoder/vp9_denoiser.c +++ b/source/libvpx/vp9/encoder/vp9_denoiser.c @@ -78,7 +78,8 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, - BLOCK_SIZE bs) { + BLOCK_SIZE bs, + int motion_magnitude) { int r, c; const uint8_t *sig_start = sig; const uint8_t *mc_avg_start = mc_avg; @@ -86,6 +87,19 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride, int diff, adj, absdiff, delta; int adj_val[] = {3, 4, 6}; int total_adj = 0; + int shift_inc = 1; + + /* If motion_magnitude is small, making the denoiser more aggressive by + * increasing the adjustment for each level. Add another increment for + * blocks that are labeled for increase denoising. 
*/ + if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) { + if (increase_denoising) { + shift_inc = 2; + } + adj_val[0] += shift_inc; + adj_val[1] += shift_inc; + adj_val[2] += shift_inc; + } // First attempt to apply a strong temporal denoising filter. for (r = 0; r < heights[bs]; ++r) { @@ -130,7 +144,8 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride, // Otherwise, we try to dampen the filter if the delta is not too high. delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising)) >> 8) + 1; - if (delta > delta_thresh(bs, increase_denoising)) { + + if (delta >= delta_thresh(bs, increase_denoising)) { return COPY_BLOCK; } @@ -145,11 +160,17 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride, adj = delta; } if (diff > 0) { + // Diff positive means we made positive adjustment above + // (in first try/attempt), so now make negative adjustment to bring + // denoised signal down. avg[c] = MAX(0, avg[c] - adj); - total_adj += adj; + total_adj -= adj; } else { + // Diff negative means we made negative adjustment above + // (in first try/attempt), so now make positive adjustment to bring + // denoised signal up. 
avg[c] = MIN(UINT8_MAX, avg[c] + adj); - total_adj -= adj; + total_adj += adj; } } sig += sig_stride; @@ -185,7 +206,8 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, int increase_denoising, int mi_row, int mi_col, - PICK_MODE_CONTEXT *ctx + PICK_MODE_CONTEXT *ctx, + int *motion_magnitude ) { int mv_col, mv_row; int sse_diff = ctx->zeromv_sse - ctx->newmv_sse; @@ -210,6 +232,8 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, mv_col = ctx->best_sse_mv.as_mv.col; mv_row = ctx->best_sse_mv.as_mv.row; + *motion_magnitude = mv_row * mv_row + mv_col * mv_col; + frame = ctx->best_reference_frame; // If the best reference frame uses inter-prediction and there is enough of a @@ -297,6 +321,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx) { + int motion_magnitude = 0; VP9_DENOISER_DECISION decision = FILTER_BLOCK; YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME]; YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y; @@ -307,13 +332,14 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, decision = perform_motion_compensation(denoiser, mb, bs, denoiser->increase_denoising, - mi_row, mi_col, ctx); + mi_row, mi_col, ctx, + &motion_magnitude); if (decision == FILTER_BLOCK) { decision = denoiser_filter(src.buf, src.stride, mc_avg_start, mc_avg.y_stride, avg_start, avg.y_stride, - 0, bs); + 0, bs, motion_magnitude); } if (decision == FILTER_BLOCK) { @@ -370,8 +396,8 @@ void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) { ctx->newmv_sse = UINT_MAX; } -void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi, - unsigned int sse, PREDICTION_MODE mode, +void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse, + PREDICTION_MODE mode, PICK_MODE_CONTEXT *ctx) { // 
TODO(tkopp): Use both MVs if possible if (mbmi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) { @@ -388,13 +414,21 @@ void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi, } int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, - int ssx, int ssy, int border) { + int ssx, int ssy, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border) { int i, fail; assert(denoiser != NULL); for (i = 0; i < MAX_REF_FRAMES; ++i) { fail = vp9_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height, - ssx, ssy, border); + ssx, ssy, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border); if (fail) { vp9_denoiser_free(denoiser); return 1; @@ -405,7 +439,11 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, } fail = vp9_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height, - ssx, ssy, border); + ssx, ssy, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border); if (fail) { vp9_denoiser_free(denoiser); return 1; diff --git a/source/libvpx/vp9/encoder/vp9_denoiser.h b/source/libvpx/vp9/encoder/vp9_denoiser.h index d93846f..a913add 100644 --- a/source/libvpx/vp9/encoder/vp9_denoiser.h +++ b/source/libvpx/vp9/encoder/vp9_denoiser.h @@ -18,6 +18,8 @@ extern "C" { #endif +#define MOTION_MAGNITUDE_THRESHOLD (8*3) + typedef enum vp9_denoiser_decision { COPY_BLOCK, FILTER_BLOCK @@ -42,12 +44,16 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx); -void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi, +void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse, PREDICTION_MODE mode, PICK_MODE_CONTEXT *ctx); int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, - int ssx, int ssy, int border); + int ssx, int ssy, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border); void vp9_denoiser_free(VP9_DENOISER *denoiser); diff --git 
a/source/libvpx/vp9/encoder/vp9_encodeframe.c b/source/libvpx/vp9/encoder/vp9_encodeframe.c index 711354b..72ced05 100644 --- a/source/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/source/libvpx/vp9/encoder/vp9_encodeframe.c @@ -727,6 +727,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, p[i].eobs = ctx->eobs_pbuf[i][0]; } ctx->is_coded = 0; + ctx->skippable = 0; x->skip_recode = 0; // Set to zero to make sure we do not use the previous encoded frame stats @@ -1232,30 +1233,23 @@ static void set_source_var_based_partition(VP9_COMP *cpi, } } -static int is_background(VP9_COMP *cpi, const TileInfo *const tile, +static int is_background(const VP9_COMP *cpi, const TileInfo *const tile, int mi_row, int mi_col) { - MACROBLOCK *x = &cpi->mb; - uint8_t *src, *pre; - int src_stride, pre_stride; - + // This assumes the input source frames are of the same dimension. const int row8x8_remaining = tile->mi_row_end - mi_row; const int col8x8_remaining = tile->mi_col_end - mi_col; - + const int x = mi_col * MI_SIZE; + const int y = mi_row * MI_SIZE; + const int src_stride = cpi->Source->y_stride; + const uint8_t *const src = &cpi->Source->y_buffer[y * src_stride + x]; + const int pre_stride = cpi->Last_Source->y_stride; + const uint8_t *const pre = &cpi->Last_Source->y_buffer[y * pre_stride + x]; int this_sad = 0; int threshold = 0; - // This assumes the input source frames are of the same dimension. 
- src_stride = cpi->Source->y_stride; - src = cpi->Source->y_buffer + (mi_row * MI_SIZE) * src_stride + - (mi_col * MI_SIZE); - pre_stride = cpi->Last_Source->y_stride; - pre = cpi->Last_Source->y_buffer + (mi_row * MI_SIZE) * pre_stride + - (mi_col * MI_SIZE); - if (row8x8_remaining >= MI_BLOCK_SIZE && col8x8_remaining >= MI_BLOCK_SIZE) { - this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride, - pre, pre_stride); + this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride, pre, pre_stride); threshold = (1 << 12); } else { int r, c; @@ -1266,8 +1260,7 @@ static int is_background(VP9_COMP *cpi, const TileInfo *const tile, threshold = (row8x8_remaining * col8x8_remaining) << 6; } - x->in_static_area = (this_sad < 2 * threshold); - return x->in_static_area; + return this_sad < 2 * threshold; } static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8, @@ -2166,8 +2159,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist); if (sum_rd < best_rd) { - int64_t stop_thresh = 4096; - int64_t stop_thresh_rd; + int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr; + int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr; best_rate = this_rate; best_dist = this_dist; @@ -2175,14 +2168,18 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; - // Adjust threshold according to partition size. - stop_thresh >>= 8 - (b_width_log2(bsize) + + // Adjust dist breakout threshold according to the partition size. + dist_breakout_thr >>= 8 - (b_width_log2(bsize) + b_height_log2(bsize)); - stop_thresh_rd = RDCOST(x->rdmult, x->rddiv, 0, stop_thresh); - // If obtained distortion is very small, choose current partition - // and stop splitting. 
- if (!x->e_mbd.lossless && best_rd < stop_thresh_rd) { + // If all y, u, v transform blocks in this partition are skippable, and + // the dist & rate are within the thresholds, the partition search is + // terminated for current branch of the partition search tree. + // The dist & rate thresholds are set to 0 at speed 0 to disable the + // early termination at that speed. + if (!x->e_mbd.lossless && + (ctx->skippable && best_dist < dist_breakout_thr && + best_rate < rate_breakout_thr)) { do_split = 0; do_rect = 0; } @@ -2606,8 +2603,6 @@ static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) { static TX_MODE select_tx_mode(const VP9_COMP *cpi) { if (cpi->mb.e_mbd.lossless) return ONLY_4X4; - if (cpi->common.frame_type == KEY_FRAME) - return TX_MODE_SELECT; if (cpi->sf.tx_size_search_method == USE_LARGESTALL) return ALLOW_32X32; else if (cpi->sf.tx_size_search_method == USE_FULL_RD|| @@ -3119,7 +3114,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, break; case REFERENCE_PARTITION: if (sf->partition_check || - !is_background(cpi, tile, mi_row, mi_col)) { + !(x->in_static_area = is_background(cpi, tile, mi_row, mi_col))) { set_modeinfo_offsets(cm, xd, mi_row, mi_col); auto_partition_range(cpi, tile, mi_row, mi_col, &sf->min_partition_size, @@ -3297,7 +3292,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_zero(cm->counts); vp9_zero(cpi->coef_counts); - vp9_zero(cpi->tx_stepdown_count); vp9_zero(rd_opt->comp_pred_diff); vp9_zero(rd_opt->filter_diff); vp9_zero(rd_opt->tx_select_diff); diff --git a/source/libvpx/vp9/encoder/vp9_encodemb.c b/source/libvpx/vp9/encoder/vp9_encodemb.c index 8a737e1..6678450 100644 --- a/source/libvpx/vp9/encoder/vp9_encodemb.c +++ b/source/libvpx/vp9/encoder/vp9_encodemb.c @@ -476,20 +476,24 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, } if (!x->skip_recode) { - if (x->skip_txfm[plane] == 0) { - // full forward transform and quantization - if (x->quant_fp) - 
vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size); - else - vp9_xform_quant(x, plane, block, plane_bsize, tx_size); - } else if (x->skip_txfm[plane] == 2) { - // fast path forward transform and quantization - vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size); + if (max_txsize_lookup[plane_bsize] == tx_size) { + if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) { + // full forward transform and quantization + if (x->quant_fp) + vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size); + else + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); + } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) { + // fast path forward transform and quantization + vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size); + } else { + // skip forward transform + p->eobs[block] = 0; + *a = *l = 0; + return; + } } else { - // skip forward transform - p->eobs[block] = 0; - *a = *l = 0; - return; + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); } } diff --git a/source/libvpx/vp9/encoder/vp9_encodemv.c b/source/libvpx/vp9/encoder/vp9_encodemv.c index 9ad6db0..9d42a12 100644 --- a/source/libvpx/vp9/encoder/vp9_encodemv.c +++ b/source/libvpx/vp9/encoder/vp9_encodemv.c @@ -216,7 +216,7 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w, // If auto_mv_step_size is enabled then keep track of the largest // motion vector component used. 
- if (!cpi->dummy_packing && cpi->sf.mv.auto_mv_step_size) { + if (cpi->sf.mv.auto_mv_step_size) { unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3; cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude); } diff --git a/source/libvpx/vp9/encoder/vp9_encoder.c b/source/libvpx/vp9/encoder/vp9_encoder.c index d27620c..2ca91b9 100644 --- a/source/libvpx/vp9/encoder/vp9_encoder.c +++ b/source/libvpx/vp9/encoder/vp9_encoder.c @@ -24,6 +24,7 @@ #include "vp9/common/vp9_postproc.h" #endif #include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_systemdependent.h" #include "vp9/common/vp9_tile_common.h" @@ -128,11 +129,13 @@ static void setup_frame(VP9_COMP *cpi) { } if (cm->frame_type == KEY_FRAME) { - if (!is_spatial_svc(cpi)) + if (!is_two_pass_svc(cpi)) cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; + vp9_zero(cpi->interp_filter_selected); } else { cm->fc = cm->frame_contexts[cm->frame_context_idx]; + vp9_zero(cpi->interp_filter_selected[0]); } } @@ -140,7 +143,9 @@ void vp9_initialize_enc() { static int init_done = 0; if (!init_done) { + vp9_rtcd(); vp9_init_neighbors(); + vp9_init_intra_predictors(); vp9_coef_tree_initialize(); vp9_tokenize_initialize(); vp9_init_me_luts(); @@ -167,6 +172,26 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->complexity_map); cpi->complexity_map = NULL; + vpx_free(cpi->nmvcosts[0]); + vpx_free(cpi->nmvcosts[1]); + cpi->nmvcosts[0] = NULL; + cpi->nmvcosts[1] = NULL; + + vpx_free(cpi->nmvcosts_hp[0]); + vpx_free(cpi->nmvcosts_hp[1]); + cpi->nmvcosts_hp[0] = NULL; + cpi->nmvcosts_hp[1] = NULL; + + vpx_free(cpi->nmvsadcosts[0]); + vpx_free(cpi->nmvsadcosts[1]); + cpi->nmvsadcosts[0] = NULL; + cpi->nmvsadcosts[1] = NULL; + + vpx_free(cpi->nmvsadcosts_hp[0]); + vpx_free(cpi->nmvsadcosts_hp[1]); + cpi->nmvsadcosts_hp[0] = NULL; + cpi->nmvsadcosts_hp[1] = NULL; + vp9_cyclic_refresh_free(cpi->cyclic_refresh); cpi->cyclic_refresh = NULL; @@ -212,8 +237,15 
@@ static void save_coding_context(VP9_COMP *cpi) { // intended for use in a re-code loop in vp9_compress_frame where the // quantizer value is adjusted between loop iterations. vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost); - vp9_copy(cc->nmvcosts, cpi->mb.nmvcosts); - vp9_copy(cc->nmvcosts_hp, cpi->mb.nmvcosts_hp); + + vpx_memcpy(cc->nmvcosts[0], cpi->nmvcosts[0], + MV_VALS * sizeof(*cpi->nmvcosts[0])); + vpx_memcpy(cc->nmvcosts[1], cpi->nmvcosts[1], + MV_VALS * sizeof(*cpi->nmvcosts[1])); + vpx_memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0], + MV_VALS * sizeof(*cpi->nmvcosts_hp[0])); + vpx_memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1], + MV_VALS * sizeof(*cpi->nmvcosts_hp[1])); vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs); @@ -233,8 +265,15 @@ static void restore_coding_context(VP9_COMP *cpi) { // Restore key state variables to the snapshot state stored in the // previous call to vp9_save_coding_context. vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost); - vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts); - vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp); + + vpx_memcpy(cpi->nmvcosts[0], cc->nmvcosts[0], + MV_VALS * sizeof(*cc->nmvcosts[0])); + vpx_memcpy(cpi->nmvcosts[1], cc->nmvcosts[1], + MV_VALS * sizeof(*cc->nmvcosts[1])); + vpx_memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0], + MV_VALS * sizeof(*cc->nmvcosts_hp[0])); + vpx_memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1], + MV_VALS * sizeof(*cc->nmvcosts_hp[1])); vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs); @@ -386,27 +425,15 @@ static void update_reference_segmentation_map(VP9_COMP *cpi) { } } - -static void set_speed_features(VP9_COMP *cpi) { -#if CONFIG_INTERNAL_STATS - int i; - for (i = 0; i < MAX_MODES; ++i) - cpi->mode_chosen_counts[i] = 0; -#endif - - vp9_set_speed_features(cpi); - - // Set rd thresholds based on mode and speed setting - vp9_set_rd_speed_thresholds(cpi); - vp9_set_rd_speed_thresholds_sub8x8(cpi); -} - static void alloc_raw_frame_buffers(VP9_COMP *cpi) { VP9_COMMON *cm = 
&cpi->common; const VP9EncoderConfig *oxcf = &cpi->oxcf; cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif oxcf->lag_in_frames); if (!cpi->lookahead) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, @@ -415,6 +442,9 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) { if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer, oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); @@ -432,6 +462,9 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { if (vp9_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); @@ -439,6 +472,9 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { if (vp9_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate scaled source buffer"); @@ -446,6 +482,9 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { if (vp9_realloc_frame_buffer(&cpi->scaled_last_source, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate scaled last source buffer"); @@ -474,10 +513,13 @@ static void update_frame_size(VP9_COMP *cpi) { vp9_init_context_buffers(cm); init_macroblockd(cm, xd); - if 
(is_spatial_svc(cpi)) { + if (is_two_pass_svc(cpi)) { if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to reallocate alt_ref_buffer"); @@ -526,7 +568,9 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { cpi->svc.number_temporal_layers = oxcf->ts_number_layers; if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || - (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) { + ((cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + cpi->oxcf.pass == 2)) { vp9_init_layer_context(cpi); } @@ -564,9 +608,9 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cm->bit_depth = oxcf->bit_depth; if (cm->profile <= PROFILE_1) - assert(cm->bit_depth == BITS_8); + assert(cm->bit_depth == VPX_BITS_8); else - assert(cm->bit_depth > BITS_8); + assert(cm->bit_depth > VPX_BITS_8); cpi->oxcf = *oxcf; @@ -618,7 +662,9 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || - (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) { + ((cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + cpi->oxcf.pass == 2)) { vp9_update_layer_context_change_config(cpi, (int)cpi->oxcf.target_bandwidth); } @@ -641,6 +687,9 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { if (cpi->oxcf.noise_sensitivity > 0) { vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif VP9_ENC_BORDER_IN_PIXELS); } #endif @@ -707,8 +756,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { cm->error.setjmp = 1; - 
vp9_rtcd(); - cpi->use_svc = 0; init_config(cpi, oxcf); @@ -734,6 +781,23 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); + CHECK_MEM_ERROR(cm, cpi->nmvcosts[0], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0]))); + CHECK_MEM_ERROR(cm, cpi->nmvcosts[1], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1]))); + CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0]))); + CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1]))); + CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0]))); + CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1]))); + CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0]))); + CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1]))); + for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0])); i++) { CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats, @@ -814,16 +878,16 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { cpi->first_time_stamp_ever = INT64_MAX; cal_nmvjointsadcost(cpi->mb.nmvjointsadcost); - cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX]; - cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX]; - cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX]; - cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX]; + cpi->mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX]; + cpi->mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX]; + cpi->mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX]; + cpi->mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX]; cal_nmvsadcosts(cpi->mb.nmvsadcost); - cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX]; - cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX]; - cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX]; - 
cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX]; + cpi->mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX]; + cpi->mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX]; + cpi->mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX]; + cpi->mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX]; cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp); #if CONFIG_VP9_TEMPORAL_DENOISING @@ -840,8 +904,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { kf_list = fopen("kf_list.stt", "w"); #endif - cpi->output_pkt_list = oxcf->output_pkt_list; - cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; if (oxcf->pass == 1) { @@ -851,7 +913,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz); if (cpi->svc.number_spatial_layers > 1 - && cpi->svc.number_temporal_layers == 1) { + || cpi->svc.number_temporal_layers > 1) { FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf; FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = {0}; int i; @@ -909,7 +971,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { } } - set_speed_features(cpi); + vp9_set_speed_features(cpi); // Allocate memory to store variances for a frame. 
CHECK_MEM_ERROR(cm, cpi->source_diff_var, @@ -1394,40 +1456,6 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, vp9_extend_frame_borders(dst); } -#define WRITE_RECON_BUFFER 0 -#if WRITE_RECON_BUFFER -void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { - FILE *yframe; - int i; - char filename[255]; - - snprintf(filename, sizeof(filename), "cx\\y%04d.raw", this_frame); - yframe = fopen(filename, "wb"); - - for (i = 0; i < frame->y_height; i++) - fwrite(frame->y_buffer + i * frame->y_stride, - frame->y_width, 1, yframe); - - fclose(yframe); - snprintf(filename, sizeof(filename), "cx\\u%04d.raw", this_frame); - yframe = fopen(filename, "wb"); - - for (i = 0; i < frame->uv_height; i++) - fwrite(frame->u_buffer + i * frame->uv_stride, - frame->uv_width, 1, yframe); - - fclose(yframe); - snprintf(filename, sizeof(filename), "cx\\v%04d.raw", this_frame); - yframe = fopen(filename, "wb"); - - for (i = 0; i < frame->uv_height; i++) - fwrite(frame->v_buffer + i * frame->uv_stride, - frame->uv_width, 1, yframe); - - fclose(yframe); -} -#endif - // Function to test for conditions that indicate we should loop // back and recode a frame. 
static int recode_loop_test(const VP9_COMP *cpi, @@ -1493,7 +1521,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { cpi->alt_fb_idx = cpi->gld_fb_idx; cpi->gld_fb_idx = tmp; - if (is_spatial_svc(cpi)) { + if (is_two_pass_svc(cpi)) { cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx; cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx; } @@ -1507,17 +1535,32 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx); + vpx_memcpy(cpi->interp_filter_selected[ALTREF_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); } if (cpi->refresh_golden_frame) { ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx); + if (!cpi->rc.is_src_frame_alt_ref) + vpx_memcpy(cpi->interp_filter_selected[GOLDEN_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); + else + vpx_memcpy(cpi->interp_filter_selected[GOLDEN_FRAME], + cpi->interp_filter_selected[ALTREF_FRAME], + sizeof(cpi->interp_filter_selected[ALTREF_FRAME])); } } if (cpi->refresh_last_frame) { ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx); + if (!cpi->rc.is_src_frame_alt_ref) + vpx_memcpy(cpi->interp_filter_selected[LAST_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); } #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { @@ -1572,6 +1615,9 @@ void vp9_scale_references(VP9_COMP *cpi) { vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; @@ -1746,7 +1792,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, // to recode. 
if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) { save_coding_context(cpi); - cpi->dummy_packing = 1; if (!cpi->sf.use_nonrd_pick_mode) vp9_pack_bitstream(cpi, dest, size); @@ -1905,8 +1950,7 @@ static int get_ref_frame_flags(const VP9_COMP *cpi) { if (gold_is_last) flags &= ~VP9_GOLD_FLAG; - if (cpi->rc.frames_till_gf_update_due == INT_MAX && - !is_spatial_svc(cpi)) + if (cpi->rc.frames_till_gf_update_due == INT_MAX && !is_two_pass_svc(cpi)) flags &= ~VP9_GOLD_FLAG; if (alt_is_last) @@ -1947,18 +1991,16 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, } } -static void configure_skippable_frame(VP9_COMP *cpi) { +static int is_skippable_frame(const VP9_COMP *cpi) { // If the current frame does not have non-zero motion vector detected in the // first pass, and so do its previous and forward frames, then this frame // can be skipped for partition check, and the partition size is assigned // according to the variance + const SVC *const svc = &cpi->svc; + const TWO_PASS *const twopass = is_two_pass_svc(cpi) ? + &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass; - SVC *const svc = &cpi->svc; - TWO_PASS *const twopass = is_spatial_svc(cpi) ? 
- &svc->layer_context[svc->spatial_layer_id].twopass - : &cpi->twopass; - - cpi->skippable_frame = (!frame_is_intra_only(&cpi->common) && + return (!frame_is_intra_only(&cpi->common) && twopass->stats_in - 2 > twopass->stats_in_start && twopass->stats_in < twopass->stats_in_end && (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion @@ -2008,11 +2050,39 @@ static void set_mv_search_params(VP9_COMP *cpi) { } } + +int setup_interp_filter_search_mask(VP9_COMP *cpi) { + INTERP_FILTER ifilter; + int ref_total[MAX_REF_FRAMES] = {0}; + MV_REFERENCE_FRAME ref; + int mask = 0; + if (cpi->common.last_frame_type == KEY_FRAME || + cpi->refresh_alt_ref_frame) + return mask; + for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) + for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) + ref_total[ref] += cpi->interp_filter_selected[ref][ifilter]; + + for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) { + if ((ref_total[LAST_FRAME] && + cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) && + (ref_total[GOLDEN_FRAME] == 0 || + cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50 + < ref_total[GOLDEN_FRAME]) && + (ref_total[ALTREF_FRAME] == 0 || + cpi->interp_filter_selected[ALTREF_FRAME][ifilter] * 50 + < ref_total[ALTREF_FRAME])) + mask |= 1 << ifilter; + } + return mask; +} + static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; struct segmentation *const seg = &cm->seg; TX_SIZE t; int q; @@ -2046,6 +2116,12 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, set_mv_search_params(cpi); + if (cpi->oxcf.pass == 2 && + cpi->sf.adaptive_interp_filter_search) + cpi->sf.interp_filter_search_mask = + setup_interp_filter_search_mask(cpi); + + // Set various flags etc to special state if it is a key frame. if (frame_is_intra_only(cm)) { // Reset the loop filter deltas and segmentation map. 
@@ -2060,9 +2136,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // The alternate reference frame cannot be active for a key frame. cpi->rc.source_alt_ref_active = 0; - cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0); - cm->frame_parallel_decoding_mode = - (cpi->oxcf.frame_parallel_decoding_mode != 0); + cm->error_resilient_mode = oxcf->error_resilient_mode; // By default, encoder assumes decoder can use prev_mi. if (cm->error_resilient_mode) { @@ -2070,29 +2144,59 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cm->reset_frame_context = 0; cm->refresh_frame_context = 0; } else if (cm->intra_only) { + cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode; // Only reset the current context. cm->reset_frame_context = 2; } } + if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) { + cm->frame_context_idx = + cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id; + + // The probs will be updated based on the frame type of its previous + // frame if frame_parallel_decoding_mode is 0. The type may vary for + // the frame after a key frame in base layer since we may drop enhancement + // layers. So set frame_parallel_decoding_mode to 1 in this case. + if (cpi->svc.number_temporal_layers == 1) { + if (cpi->svc.spatial_layer_id == 0 && + cpi->svc.layer_context[0].last_frame_type == KEY_FRAME) + cm->frame_parallel_decoding_mode = 1; + else + cm->frame_parallel_decoding_mode = 0; + } else if (cpi->svc.spatial_layer_id == 0) { + // Find the 2nd frame in temporal base layer and 1st frame in temporal + // enhancement layers from the key frame. 
+ int i; + for (i = 0; i < cpi->svc.number_temporal_layers; ++i) { + if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) { + cm->frame_parallel_decoding_mode = 1; + break; + } + } + if (i == cpi->svc.number_temporal_layers) + cm->frame_parallel_decoding_mode = 0; + } + } // Configure experimental use of segmentation for enhanced coding of // static regions if indicated. // Only allowed in second pass of two pass (as requires lagged coding) // and if the relevant speed feature flag is set. - if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation) + if (oxcf->pass == 2 && cpi->sf.static_segmentation) configure_static_seg_features(cpi); // Check if the current frame is skippable for the partition search in the // second pass according to the first pass stats - if (cpi->oxcf.pass == 2 && - (!cpi->use_svc || is_spatial_svc(cpi))) { - configure_skippable_frame(cpi); + if (oxcf->pass == 2 && + (!cpi->use_svc || is_two_pass_svc(cpi))) { + cpi->skippable_frame = is_skippable_frame(cpi); } // For 1 pass CBR, check if we are dropping this frame. // Never drop on key frame. - if (cpi->oxcf.pass == 0 && - cpi->oxcf.rc_mode == VPX_CBR && + if (oxcf->pass == 0 && + oxcf->rc_mode == VPX_CBR && cm->frame_type != KEY_FRAME) { if (vp9_rc_drop_frame(cpi)) { vp9_rc_postencode_update_drop_frame(cpi); @@ -2104,9 +2208,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_clear_system_state(); #if CONFIG_VP9_POSTPROC - if (cpi->oxcf.noise_sensitivity > 0) { + if (oxcf->noise_sensitivity > 0) { int l = 0; - switch (cpi->oxcf.noise_sensitivity) { + switch (oxcf->noise_sensitivity) { case 1: l = 20; break; @@ -2128,7 +2232,16 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } #endif - set_speed_features(cpi); +#if CONFIG_INTERNAL_STATS + int i; + for (i = 0; i < MAX_MODES; ++i) + cpi->mode_chosen_counts[i] = 0; +#endif + + vp9_set_speed_features(cpi); + + vp9_set_rd_speed_thresholds(cpi); + vp9_set_rd_speed_thresholds_sub8x8(cpi); // Decide q and q bounds. 
q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index); @@ -2147,7 +2260,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, #if CONFIG_VP9_TEMPORAL_DENOISING #ifdef OUTPUT_YUV_DENOISED - if (cpi->oxcf.noise_sensitivity > 0) { + if (oxcf->noise_sensitivity > 0) { vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME], yuv_denoised_file); } @@ -2168,29 +2281,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cm->frame_to_show = get_frame_new_buffer(cm); -#if WRITE_RECON_BUFFER - if (cm->show_frame) - write_cx_frame_to_file(cm->frame_to_show, - cm->current_video_frame); - else - write_cx_frame_to_file(cm->frame_to_show, - cm->current_video_frame + 1000); -#endif - // Pick the loop filter level for the frame. loopfilter_frame(cpi, cm); -#if WRITE_RECON_BUFFER - if (cm->show_frame) - write_cx_frame_to_file(cm->frame_to_show, - cm->current_video_frame + 2000); - else - write_cx_frame_to_file(cm->frame_to_show, - cm->current_video_frame + 3000); -#endif - // build the bitstream - cpi->dummy_packing = 0; vp9_pack_bitstream(cpi, dest, size); if (cm->seg.update_map) @@ -2249,8 +2343,12 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cm->last_height = cm->height; // reset to normal state now that we are done. 
- if (!cm->show_existing_frame) - cm->last_show_frame = cm->show_frame; + if (!cm->show_existing_frame) { + if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) + cm->last_show_frame = 0; + else + cm->last_show_frame = cm->show_frame; + } if (cm->show_frame) { vp9_swap_mi_and_prev_mi(cm); @@ -2259,8 +2357,12 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // update not a real frame ++cm->current_video_frame; if (cpi->use_svc) - vp9_inc_frame_in_layer(&cpi->svc); + vp9_inc_frame_in_layer(cpi); } + + if (is_two_pass_svc(cpi)) + cpi->svc.layer_context[cpi->svc.spatial_layer_id].last_frame_type = + cm->frame_type; } static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest, @@ -2333,7 +2435,7 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags, vpx_usec_timer_start(&timer); #if CONFIG_SPATIAL_SVC - if (is_spatial_svc(cpi)) + if (is_two_pass_svc(cpi)) res = vp9_svc_lookahead_push(cpi, cpi->lookahead, sd, time_stamp, end_time, frame_flags); else @@ -2375,8 +2477,8 @@ static int frame_is_reference(const VP9_COMP *cpi) { cm->seg.update_data; } -void adjust_frame_rate(VP9_COMP *cpi) { - const struct lookahead_entry *const source = cpi->source; +void adjust_frame_rate(VP9_COMP *cpi, + const struct lookahead_entry *source) { int64_t this_duration; int step = 0; @@ -2432,7 +2534,8 @@ static int get_arf_src_index(VP9_COMP *cpi) { return arf_src_index; } -static void check_src_altref(VP9_COMP *cpi) { +static void check_src_altref(VP9_COMP *cpi, + const struct lookahead_entry *source) { RATE_CONTROL *const rc = &cpi->rc; if (cpi->oxcf.pass == 2) { @@ -2441,7 +2544,7 @@ static void check_src_altref(VP9_COMP *cpi) { (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE); } else { rc->is_src_frame_alt_ref = cpi->alt_ref_source && - (cpi->source == cpi->alt_ref_source); + (source == cpi->alt_ref_source); } if (rc->is_src_frame_alt_ref) { @@ -2463,10 +2566,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, RATE_CONTROL 
*const rc = &cpi->rc; struct vpx_usec_timer cmptimer; YV12_BUFFER_CONFIG *force_src_buffer = NULL; + struct lookahead_entry *last_source = NULL; + struct lookahead_entry *source = NULL; MV_REFERENCE_FRAME ref_frame; int arf_src_index; - if (is_spatial_svc(cpi) && oxcf->pass == 2) { + if (is_two_pass_svc(cpi) && oxcf->pass == 2) { #if CONFIG_SPATIAL_SVC vp9_svc_lookahead_peek(cpi, cpi->lookahead, 0, 1); #endif @@ -2475,9 +2580,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, vpx_usec_timer_start(&cmptimer); - cpi->source = NULL; - cpi->last_source = NULL; - vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV); // Normal defaults @@ -2493,17 +2595,16 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, assert(arf_src_index <= rc->frames_to_key); #if CONFIG_SPATIAL_SVC - if (is_spatial_svc(cpi)) - cpi->source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, - arf_src_index, 0); + if (is_two_pass_svc(cpi)) + source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, arf_src_index, 0); else #endif - cpi->source = vp9_lookahead_peek(cpi->lookahead, arf_src_index); - if (cpi->source != NULL) { - cpi->alt_ref_source = cpi->source; + source = vp9_lookahead_peek(cpi->lookahead, arf_src_index); + if (source != NULL) { + cpi->alt_ref_source = source; #if CONFIG_SPATIAL_SVC - if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0) { + if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) { int i; // Reference a hidden frame from a lower layer for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) { @@ -2534,46 +2635,44 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } } - if (!cpi->source) { + if (!source) { // Get last frame source. 
if (cm->current_video_frame > 0) { #if CONFIG_SPATIAL_SVC - if (is_spatial_svc(cpi)) - cpi->last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0); + if (is_two_pass_svc(cpi)) + last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0); else #endif - cpi->last_source = vp9_lookahead_peek(cpi->lookahead, -1); - if (cpi->last_source == NULL) + last_source = vp9_lookahead_peek(cpi->lookahead, -1); + if (last_source == NULL) return -1; } // Read in the source frame. #if CONFIG_SPATIAL_SVC - if (is_spatial_svc(cpi)) - cpi->source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush); + if (is_two_pass_svc(cpi)) + source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush); else #endif - cpi->source = vp9_lookahead_pop(cpi->lookahead, flush); - if (cpi->source != NULL) { + source = vp9_lookahead_pop(cpi->lookahead, flush); + if (source != NULL) { cm->show_frame = 1; cm->intra_only = 0; // Check to see if the frame should be encoded as an arf overlay. - check_src_altref(cpi); + check_src_altref(cpi, source); } } - if (cpi->source) { + if (source) { cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer - : &cpi->source->img; + : &source->img; - cpi->unscaled_last_source = cpi->last_source != NULL ? - &cpi->last_source->img : NULL; + cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL; - *time_stamp = cpi->source->ts_start; - *time_end = cpi->source->ts_end; - *frame_flags = - (cpi->source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; + *time_stamp = source->ts_start; + *time_end = source->ts_end; + *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ? 
FRAMEFLAGS_KEY : 0; } else { *size = 0; @@ -2584,9 +2683,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, return -1; } - if (cpi->source->ts_start < cpi->first_time_stamp_ever) { - cpi->first_time_stamp_ever = cpi->source->ts_start; - cpi->last_end_time_stamp_seen = cpi->source->ts_start; + if (source->ts_start < cpi->first_time_stamp_ever) { + cpi->first_time_stamp_ever = source->ts_start; + cpi->last_end_time_stamp_seen = source->ts_start; } // Clear down mmx registers @@ -2594,7 +2693,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // adjust frame rates based on timestamps given if (cm->show_frame) { - adjust_frame_rate(cpi); + adjust_frame_rate(cpi, source); } if (cpi->svc.number_temporal_layers > 1 && @@ -2636,6 +2735,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); alloc_util_frame_buffers(cpi); @@ -2662,13 +2764,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } if (oxcf->pass == 1 && - (!cpi->use_svc || is_spatial_svc(cpi))) { + (!cpi->use_svc || is_two_pass_svc(cpi))) { const int lossless = is_lossless_requested(oxcf); cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4; cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; - vp9_first_pass(cpi); + vp9_first_pass(cpi, source); } else if (oxcf->pass == 2 && - (!cpi->use_svc || is_spatial_svc(cpi))) { + (!cpi->use_svc || is_two_pass_svc(cpi))) { Pass2Encode(cpi, size, dest, frame_flags); } else if (cpi->use_svc) { SvcEncode(cpi, size, dest, frame_flags); @@ -2691,8 +2793,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Save layer specific state. 
if ((cpi->svc.number_temporal_layers > 1 && - oxcf->rc_mode == VPX_CBR) || - (cpi->svc.number_spatial_layers > 1 && oxcf->pass == 2)) { + oxcf->rc_mode == VPX_CBR) || + ((cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + oxcf->pass == 2)) { vp9_save_layer_context(cpi); } @@ -2744,12 +2848,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->totalp_sq_error += psnr2.sse[0]; cpi->totalp_samples += psnr2.samples[0]; - frame_ssim2 = vp9_calc_ssim(orig, recon, 1, &weight); + frame_ssim2 = vp9_calc_ssim(orig, recon, &weight); cpi->summed_quality += frame_ssim2 * weight; cpi->summed_weights += weight; - frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, 1, &weight); + frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight); cpi->summedp_quality += frame_ssim2 * weight; cpi->summedp_weights += weight; @@ -2765,6 +2869,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } } + if (cpi->b_calculate_ssimg) { double y, u, v, frame_all; frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v); diff --git a/source/libvpx/vp9/encoder/vp9_encoder.h b/source/libvpx/vp9/encoder/vp9_encoder.h index 82be0f4..0d3c4c1 100644 --- a/source/libvpx/vp9/encoder/vp9_encoder.h +++ b/source/libvpx/vp9/encoder/vp9_encoder.h @@ -114,9 +114,10 @@ typedef enum { typedef struct VP9EncoderConfig { BITSTREAM_PROFILE profile; - BIT_DEPTH bit_depth; + vpx_bit_depth_t bit_depth; // Codec bit-depth. int width; // width of data passed to the compressor int height; // height of data passed to the compressor + unsigned int input_bit_depth; // Input bit depth. 
double init_framerate; // set to passed in framerate int64_t target_bandwidth; // bandwidth to be used in kilobits per second @@ -203,16 +204,15 @@ typedef struct VP9EncoderConfig { int arnr_max_frames; int arnr_strength; - int arnr_type; int tile_columns; int tile_rows; - struct vpx_fixed_buf two_pass_stats_in; - struct vpx_codec_pkt_list *output_pkt_list; + vpx_fixed_buf_t two_pass_stats_in; + struct vpx_codec_pkt_list *output_pkt_list; #if CONFIG_FP_MB_STATS - struct vpx_fixed_buf firstpass_mb_stats_in; + vpx_fixed_buf_t firstpass_mb_stats_in; #endif vp8e_tuning tuning; @@ -223,19 +223,13 @@ static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; } -static INLINE int is_best_mode(MODE mode) { - return mode == BEST; -} - typedef struct VP9_COMP { QUANTS quants; MACROBLOCK mb; VP9_COMMON common; VP9EncoderConfig oxcf; struct lookahead_ctx *lookahead; - struct lookahead_entry *source; struct lookahead_entry *alt_ref_source; - struct lookahead_entry *last_source; YV12_BUFFER_CONFIG *Source; YV12_BUFFER_CONFIG *Last_Source; // NULL for first frame and alt_ref frames @@ -275,6 +269,11 @@ typedef struct VP9_COMP { CODING_CONTEXT coding_context; + int *nmvcosts[2]; + int *nmvcosts_hp[2]; + int *nmvsadcosts[2]; + int *nmvsadcosts_hp[2]; + int zbin_mode_boost; int zbin_mode_boost_enabled; @@ -286,6 +285,7 @@ typedef struct VP9_COMP { double framerate; vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES]; + int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]; struct vpx_codec_pkt_list *output_pkt_list; @@ -332,7 +332,7 @@ typedef struct VP9_COMP { TWO_PASS twopass; YV12_BUFFER_CONFIG alt_ref_buffer; - YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; + #if CONFIG_INTERNAL_STATS unsigned int mode_chosen_counts[MAX_MODES]; @@ -371,10 +371,6 @@ typedef struct VP9_COMP { int droppable; - int dummy_packing; /* flag to indicate if packing is dummy */ - - unsigned int tx_stepdown_count[TX_SIZES]; - int 
initial_width; int initial_height; @@ -393,7 +389,7 @@ typedef struct VP9_COMP { search_site_config ss_cfg; int mbmode_cost[INTRA_MODES]; - unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES]; + unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES]; int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES]; int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; @@ -499,16 +495,17 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); -static INLINE int is_spatial_svc(const struct VP9_COMP *const cpi) { +static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) { return cpi->use_svc && - cpi->svc.number_temporal_layers == 1 && - cpi->svc.number_spatial_layers > 1; + (cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + (cpi->oxcf.pass == 1 || cpi->oxcf.pass == 2); } static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 && (cpi->oxcf.play_alternate && - (!is_spatial_svc(cpi) || + (!is_two_pass_svc(cpi) || cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id])); } @@ -525,6 +522,10 @@ static INLINE int get_chessboard_index(const int frame_index) { return frame_index & 0x1; } +static INLINE int *cond_sad_list(const struct VP9_COMP *cpi, int *sad_list) { + return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? 
sad_list : NULL; +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/source/libvpx/vp9/encoder/vp9_firstpass.c b/source/libvpx/vp9/encoder/vp9_firstpass.c index 94bbe9c..8041b59 100644 --- a/source/libvpx/vp9/encoder/vp9_firstpass.c +++ b/source/libvpx/vp9/encoder/vp9_firstpass.c @@ -76,16 +76,6 @@ static void reset_fpf_position(TWO_PASS *p, p->stats_in = position; } -static int lookup_next_frame_stats(const TWO_PASS *p, - FIRSTPASS_STATS *next_frame) { - if (p->stats_in >= p->stats_in_end) - return EOF; - - *next_frame = *p->stats_in; - return 1; -} - - // Read frame stats at an offset from the current position. static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) { if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) || @@ -256,7 +246,7 @@ void vp9_init_first_pass(VP9_COMP *cpi) { } void vp9_end_first_pass(VP9_COMP *cpi) { - if (is_spatial_svc(cpi)) { + if (is_two_pass_svc(cpi)) { int i; for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { output_stats(&cpi->svc.layer_context[i].twopass.total_stats, @@ -396,7 +386,7 @@ static void set_first_pass_params(VP9_COMP *cpi) { cpi->rc.frames_to_key = INT_MAX; } -void vp9_first_pass(VP9_COMP *cpi) { +void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { int mb_row, mb_col; MACROBLOCK *const x = &cpi->mb; VP9_COMMON *const cm = &cpi->common; @@ -428,12 +418,12 @@ void vp9_first_pass(VP9_COMP *cpi) { int neutral_count = 0; int new_mv_count = 0; int sum_in_vectors = 0; - uint32_t lastmv_as_int = 0; + MV lastmv = {0, 0}; TWO_PASS *twopass = &cpi->twopass; const MV zero_mv = {0, 0}; const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; - LAYER_CONTEXT *const lc = is_spatial_svc(cpi) ? - &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0; + LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ? 
+ &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL; #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { @@ -448,13 +438,13 @@ void vp9_first_pass(VP9_COMP *cpi) { if (lc != NULL) { MV_REFERENCE_FRAME ref_frame = LAST_FRAME; - const YV12_BUFFER_CONFIG *scaled_ref_buf = NULL; twopass = &lc->twopass; if (cpi->common.current_video_frame == 0) { cpi->ref_frame_flags = 0; } else { - if (lc->current_video_frame_in_layer == 0) + if (lc->current_video_frame_in_layer < + (unsigned int)cpi->svc.number_temporal_layers) cpi->ref_frame_flags = VP9_GOLD_FLAG; else cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; @@ -464,16 +454,17 @@ void vp9_first_pass(VP9_COMP *cpi) { // Use either last frame or alt frame for motion search. if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - scaled_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); + first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); ref_frame = LAST_FRAME; + if (first_ref_buf == NULL) + first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME); } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - scaled_ref_buf = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME); + first_ref_buf = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME); ref_frame = GOLDEN_FRAME; + if (first_ref_buf == NULL) + first_ref_buf = get_ref_frame_buffer(cpi, GOLDEN_FRAME); } - if (scaled_ref_buf != NULL) - first_ref_buf = scaled_ref_buf; - recon_y_stride = new_yv12->y_stride; recon_uv_stride = new_yv12->uv_stride; uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); @@ -512,9 +503,7 @@ void vp9_first_pass(VP9_COMP *cpi) { vp9_tile_init(&tile, cm, 0, 0); for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { - int_mv best_ref_mv; - - best_ref_mv.as_int = 0; + MV best_ref_mv = {0, 0}; // Reset above block coeffs. xd->up_available = (mb_row != 0); @@ -594,14 +583,13 @@ void vp9_first_pass(VP9_COMP *cpi) { // Other than for the first frame do a motion search. 
if (cm->current_video_frame > 0) { int tmp_err, motion_error, raw_motion_error; - int_mv mv, tmp_mv; + // Assume 0,0 motion with no mv overhead. + MV mv = {0, 0} , tmp_mv = {0, 0}; struct buf_2d unscaled_last_source_buf_2d; xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; motion_error = get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); - // Assume 0,0 motion with no mv overhead. - mv.as_int = tmp_mv.as_int = 0; // Compute the motion error of the 0,0 motion using the last source // frame as the reference. Skip the further motion search on @@ -617,8 +605,7 @@ void vp9_first_pass(VP9_COMP *cpi) { if (raw_motion_error > 25 || lc != NULL) { // Test last reference frame using the previous best mv as the // starting point (best reference) for the search. - first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv, - &motion_error); + first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { vp9_clear_system_state(); motion_error = (int)(motion_error * error_weight); @@ -626,9 +613,9 @@ void vp9_first_pass(VP9_COMP *cpi) { // If the current best reference mv is not centered on 0,0 then do a // 0,0 based search as well. 
- if (best_ref_mv.as_int) { + if (!is_zero_mv(&best_ref_mv)) { tmp_err = INT_MAX; - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, &tmp_err); + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { vp9_clear_system_state(); tmp_err = (int)(tmp_err * error_weight); @@ -636,7 +623,7 @@ void vp9_first_pass(VP9_COMP *cpi) { if (tmp_err < motion_error) { motion_error = tmp_err; - mv.as_int = tmp_mv.as_int; + mv = tmp_mv; } } @@ -649,7 +636,7 @@ void vp9_first_pass(VP9_COMP *cpi) { gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { vp9_clear_system_state(); @@ -680,7 +667,8 @@ void vp9_first_pass(VP9_COMP *cpi) { } // Start by assuming that intra mode is best. - best_ref_mv.as_int = 0; + best_ref_mv.row = 0; + best_ref_mv.col = 0; #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { @@ -704,25 +692,25 @@ void vp9_first_pass(VP9_COMP *cpi) { this_error < 2 * intrapenalty) ++neutral_count; - mv.as_mv.row *= 8; - mv.as_mv.col *= 8; + mv.row *= 8; + mv.col *= 8; this_error = motion_error; xd->mi[0]->mbmi.mode = NEWMV; - xd->mi[0]->mbmi.mv[0] = mv; + xd->mi[0]->mbmi.mv[0].as_mv = mv; xd->mi[0]->mbmi.tx_size = TX_4X4; xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME; xd->mi[0]->mbmi.ref_frame[1] = NONE; vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); vp9_encode_sby_pass1(x, bsize); - sum_mvr += mv.as_mv.row; - sum_mvr_abs += abs(mv.as_mv.row); - sum_mvc += mv.as_mv.col; - sum_mvc_abs += abs(mv.as_mv.col); - sum_mvrs += mv.as_mv.row * mv.as_mv.row; - sum_mvcs += mv.as_mv.col * mv.as_mv.col; + sum_mvr += mv.row; + sum_mvr_abs += abs(mv.row); + sum_mvc += mv.col; + sum_mvc_abs += abs(mv.col); + sum_mvrs += mv.row * mv.row; + sum_mvcs += mv.col * mv.col; ++intercount; - best_ref_mv.as_int = mv.as_int; + 
best_ref_mv = mv; #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { @@ -740,7 +728,7 @@ void vp9_first_pass(VP9_COMP *cpi) { } #endif - if (mv.as_int) { + if (!is_zero_mv(&mv)) { ++mvcount; #if CONFIG_FP_MB_STATS @@ -771,33 +759,33 @@ void vp9_first_pass(VP9_COMP *cpi) { #endif // Non-zero vector, was it different from the last non zero vector? - if (mv.as_int != lastmv_as_int) + if (!is_equal_mv(&mv, &lastmv)) ++new_mv_count; - lastmv_as_int = mv.as_int; + lastmv = mv; // Does the row vector point inwards or outwards? if (mb_row < cm->mb_rows / 2) { - if (mv.as_mv.row > 0) + if (mv.row > 0) --sum_in_vectors; - else if (mv.as_mv.row < 0) + else if (mv.row < 0) ++sum_in_vectors; } else if (mb_row > cm->mb_rows / 2) { - if (mv.as_mv.row > 0) + if (mv.row > 0) ++sum_in_vectors; - else if (mv.as_mv.row < 0) + else if (mv.row < 0) --sum_in_vectors; } // Does the col vector point inwards or outwards? if (mb_col < cm->mb_cols / 2) { - if (mv.as_mv.col > 0) + if (mv.col > 0) --sum_in_vectors; - else if (mv.as_mv.col < 0) + else if (mv.col < 0) ++sum_in_vectors; } else if (mb_col > cm->mb_cols / 2) { - if (mv.as_mv.col > 0) + if (mv.col > 0) ++sum_in_vectors; - else if (mv.as_mv.col < 0) + else if (mv.col < 0) --sum_in_vectors; } } @@ -865,7 +853,7 @@ void vp9_first_pass(VP9_COMP *cpi) { // TODO(paulwilkins): Handle the case when duration is set to 0, or // something less than the full time between subsequent values of // cpi->source_time_stamp. - fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start); + fps.duration = (double)(source->ts_end - source->ts_start); // Don't want to do output stats with a stack variable! 
twopass->this_frame_stats = fps; @@ -927,7 +915,7 @@ void vp9_first_pass(VP9_COMP *cpi) { ++cm->current_video_frame; if (cpi->use_svc) - vp9_inc_frame_in_layer(&cpi->svc); + vp9_inc_frame_in_layer(cpi); } static double calc_correction_factor(double err_per_mb, @@ -965,7 +953,7 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi, BPER_MB_NORMBITS) / num_mbs; int q; int is_svc_upper_layer = 0; - if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0) + if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) is_svc_upper_layer = 1; // Try and pick a max Q that will be high enough to encode the @@ -993,9 +981,9 @@ extern void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_init_second_pass(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; - const int is_spatial_svc = (svc->number_spatial_layers > 1) && - (svc->number_temporal_layers == 1); - TWO_PASS *const twopass = is_spatial_svc ? + const int is_two_pass_svc = (svc->number_spatial_layers > 1) || + (svc->number_temporal_layers > 1); + TWO_PASS *const twopass = is_two_pass_svc ? &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass; double frame_rate; FIRSTPASS_STATS *stats; @@ -1018,7 +1006,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // It is calculated based on the actual durations of all frames from the // first pass. - if (is_spatial_svc) { + if (is_two_pass_svc) { vp9_update_spatial_layer_framerate(cpi, frame_rate); twopass->bits_left = (int64_t)(stats->duration * svc->layer_context[svc->spatial_layer_id].target_bandwidth / @@ -1033,7 +1021,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // scores used in the second pass. We have this minimum to make sure // that clips that are static but "low complexity" in the intra domain // are still boosted appropriately for KF/GF/ARF. - if (!is_spatial_svc) { + if (!is_two_pass_svc) { // We don't know the number of MBs for each layer at this point. // So we will do it later. 
twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs; @@ -1381,6 +1369,13 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, int mid_boost_bits = 0; int mid_frame_idx; unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS]; + int alt_frame_index = frame_index; + int has_temporal_layers = is_two_pass_svc(cpi) && + cpi->svc.number_temporal_layers > 1; + + // Only encode alt reference frame in temporal base layer. + if (has_temporal_layers) + alt_frame_index = cpi->svc.number_temporal_layers; key_frame = cpi->common.frame_type == KEY_FRAME || vp9_is_upper_layer_key_frame(cpi); @@ -1416,16 +1411,24 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, // Store the bits to spend on the ARF if there is one. if (rc->source_alt_ref_pending) { - gf_group->update_type[frame_index] = ARF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_STD; - gf_group->bit_allocation[frame_index] = gf_arf_bits; - gf_group->arf_src_offset[frame_index] = - (unsigned char)(rc->baseline_gf_interval - 1); - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[frame_index] = + gf_group->update_type[alt_frame_index] = ARF_UPDATE; + gf_group->rf_level[alt_frame_index] = GF_ARF_STD; + gf_group->bit_allocation[alt_frame_index] = gf_arf_bits; + + if (has_temporal_layers) + gf_group->arf_src_offset[alt_frame_index] = + (unsigned char)(rc->baseline_gf_interval - + cpi->svc.number_temporal_layers); + else + gf_group->arf_src_offset[alt_frame_index] = + (unsigned char)(rc->baseline_gf_interval - 1); + + gf_group->arf_update_idx[alt_frame_index] = arf_buffer_indices[0]; + gf_group->arf_ref_idx[alt_frame_index] = arf_buffer_indices[cpi->multi_arf_last_grp_enabled && rc->source_alt_ref_active]; - ++frame_index; + if (!has_temporal_layers) + ++frame_index; if (cpi->multi_arf_enabled) { // Set aside a slot for a level 1 arf. 
@@ -1448,6 +1451,10 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, if (EOF == input_stats(twopass, &frame_stats)) break; + if (has_temporal_layers && frame_index == alt_frame_index) { + ++frame_index; + } + modified_err = calculate_modified_err(twopass, oxcf, &frame_stats); if (group_error > 0) @@ -1669,6 +1676,21 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { else rc->baseline_gf_interval = i; + // Only encode alt reference frame in temporal base layer. So + // baseline_gf_interval should be multiple of a temporal layer group + // (typically the frame distance between two base layer frames) + if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) { + int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1; + int new_gf_interval = (rc->baseline_gf_interval + count) & (~count); + int j; + for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) { + if (EOF == input_stats(twopass, this_frame)) + break; + gf_group_err += calculate_modified_err(twopass, oxcf, this_frame); + } + rc->baseline_gf_interval = new_gf_interval; + } + rc->frames_till_gf_update_due = rc->baseline_gf_interval; // Should we use the alternate reference frame. @@ -1874,16 +1896,17 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { input_stats(twopass, this_frame); // Provided that we are not at the end of the file... - if (cpi->oxcf.auto_key && - lookup_next_frame_stats(twopass, &next_frame) != EOF) { + if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) { double loop_decay_rate; // Check for a scene cut. - if (test_candidate_kf(twopass, &last_frame, this_frame, &next_frame)) + if (test_candidate_kf(twopass, &last_frame, this_frame, + twopass->stats_in)) break; // How fast is the prediction quality decaying? 
- loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame); + loop_decay_rate = get_prediction_decay_rate(&cpi->common, + twopass->stats_in); // We want to know something about the recent past... rather than // as used elsewhere where we are concerned with decay in prediction @@ -1940,6 +1963,18 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->next_key_frame_forced = 0; } + if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) { + int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1; + int new_frame_to_key = (rc->frames_to_key + count) & (~count); + int j; + for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) { + if (EOF == input_stats(twopass, this_frame)) + break; + kf_group_err += calculate_modified_err(twopass, oxcf, this_frame); + } + rc->frames_to_key = new_frame_to_key; + } + // Special case for the last key frame of the file. if (twopass->stats_in >= twopass->stats_in_end) { // Accumulate kf group error. @@ -2098,7 +2133,7 @@ void configure_buffer_updates(VP9_COMP *cpi) { assert(0); break; } - if (is_spatial_svc(cpi)) { + if (is_two_pass_svc(cpi)) { if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0) cpi->refresh_golden_frame = 0; if (cpi->alt_ref_source == NULL) @@ -2117,7 +2152,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { FIRSTPASS_STATS this_frame_copy; int target_rate; - LAYER_CONTEXT *const lc = is_spatial_svc(cpi) ? + LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ? 
&cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0; if (lc != NULL) { @@ -2200,15 +2235,18 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (lc != NULL) { if (cpi->svc.spatial_layer_id == 0) { lc->is_key_frame = (cm->frame_type == KEY_FRAME); - if (lc->is_key_frame) + if (lc->is_key_frame) { cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); + lc->frames_from_key_frame = 0; + } } else { cm->frame_type = INTER_FRAME; lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; if (lc->is_key_frame) { cpi->ref_frame_flags &= (~VP9_LAST_FLAG); + lc->frames_from_key_frame = 0; } } } diff --git a/source/libvpx/vp9/encoder/vp9_firstpass.h b/source/libvpx/vp9/encoder/vp9_firstpass.h index bf8c9fd..aaa6b03 100644 --- a/source/libvpx/vp9/encoder/vp9_firstpass.h +++ b/source/libvpx/vp9/encoder/vp9_firstpass.h @@ -121,7 +121,7 @@ struct VP9_COMP; void vp9_init_first_pass(struct VP9_COMP *cpi); void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi); -void vp9_first_pass(struct VP9_COMP *cpi); +void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source); void vp9_end_first_pass(struct VP9_COMP *cpi); void vp9_init_second_pass(struct VP9_COMP *cpi); diff --git a/source/libvpx/vp9/encoder/vp9_lookahead.c b/source/libvpx/vp9/encoder/vp9_lookahead.c index e743517..823e7a1 100644 --- a/source/libvpx/vp9/encoder/vp9_lookahead.c +++ b/source/libvpx/vp9/encoder/vp9_lookahead.c @@ -50,6 +50,9 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width, unsigned int height, unsigned int subsampling_x, unsigned int subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif unsigned int depth) { struct lookahead_ctx *ctx = NULL; @@ -70,6 +73,9 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width, for (i = 0; i < depth; i++) if (vp9_alloc_frame_buffer(&ctx->buf[i].img, width, height, subsampling_x, subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif VP9_ENC_BORDER_IN_PIXELS)) goto bail; 
} diff --git a/source/libvpx/vp9/encoder/vp9_lookahead.h b/source/libvpx/vp9/encoder/vp9_lookahead.h index 678c51a..2786193 100644 --- a/source/libvpx/vp9/encoder/vp9_lookahead.h +++ b/source/libvpx/vp9/encoder/vp9_lookahead.h @@ -56,6 +56,9 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width, unsigned int height, unsigned int subsampling_x, unsigned int subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif unsigned int depth); diff --git a/source/libvpx/vp9/encoder/vp9_mbgraph.c b/source/libvpx/vp9/encoder/vp9_mbgraph.c index 6e04e2a..b8e7164 100644 --- a/source/libvpx/vp9/encoder/vp9_mbgraph.c +++ b/source/libvpx/vp9/encoder/vp9_mbgraph.c @@ -34,6 +34,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const int tmp_row_min = x->mv_row_min; const int tmp_row_max = x->mv_row_max; MV ref_full; + int sad_list[5]; // Further step/diamond searches as necessary int step_param = mv_sf->reduce_first_step_size; @@ -45,8 +46,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, ref_full.row = ref_mv->row >> 3; /*cpi->sf.search_method == HEX*/ - vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0, &v_fn_ptr, 0, - ref_mv, dst_mv); + vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0, + cond_sad_list(cpi, sad_list), + &v_fn_ptr, 0, ref_mv, dst_mv); // Try sub-pixel MC // if (bestsme > error_thresh && bestsme < INT_MAX) @@ -55,8 +57,10 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, unsigned int sse; cpi->find_fractional_mv_step( x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, - &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, NULL, NULL, &distortion, - &sse, NULL, 0, 0); + &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, + cond_sad_list(cpi, sad_list), + NULL, NULL, + &distortion, &sse, NULL, 0, 0); } xd->mi[0]->mbmi.mode = NEWMV; diff --git a/source/libvpx/vp9/encoder/vp9_mcomp.c b/source/libvpx/vp9/encoder/vp9_mcomp.c index ae924d5..d6f6b25 100644 --- 
a/source/libvpx/vp9/encoder/vp9_mcomp.c +++ b/source/libvpx/vp9/encoder/vp9_mcomp.c @@ -256,6 +256,137 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { } \ } +#define SETUP_SUBPEL_SEARCH \ + const uint8_t *const z = x->plane[0].src.buf; \ + const int src_stride = x->plane[0].src.stride; \ + const MACROBLOCKD *xd = &x->e_mbd; \ + unsigned int besterr = INT_MAX; \ + unsigned int sse; \ + unsigned int whichdir; \ + int thismse; \ + const unsigned int halfiters = iters_per_step; \ + const unsigned int quarteriters = iters_per_step; \ + const unsigned int eighthiters = iters_per_step; \ + const int y_stride = xd->plane[0].pre[0].stride; \ + const int offset = bestmv->row * y_stride + bestmv->col; \ + const uint8_t *const y = xd->plane[0].pre[0].buf; \ + \ + int rr = ref_mv->row; \ + int rc = ref_mv->col; \ + int br = bestmv->row * 8; \ + int bc = bestmv->col * 8; \ + int hstep = 4; \ + const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); \ + const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); \ + const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); \ + const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); \ + int tr = br; \ + int tc = bc; \ + \ + bestmv->row *= 8; \ + bestmv->col *= 8; \ + if (second_pred != NULL) { \ + DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); \ + vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \ + besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); \ + } else { \ + besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); \ + } \ + *distortion = besterr; \ + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); + +int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x, + MV *bestmv, const MV *ref_mv, + int allow_hp, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int forced_stop, + int iters_per_step, + int *sad_list, + int *mvjcost, int *mvcost[2], + int *distortion, + unsigned int *sse1, + const 
uint8_t *second_pred, + int w, int h) { + SETUP_SUBPEL_SEARCH; + + if (sad_list && + sad_list[0] != INT_MAX && sad_list[1] != INT_MAX && + sad_list[2] != INT_MAX && sad_list[3] != INT_MAX && + sad_list[4] != INT_MAX) { + unsigned int left, right, up, down, diag; + whichdir = (sad_list[1] < sad_list[3] ? 0 : 1) + + (sad_list[2] < sad_list[4] ? 0 : 2); + switch (whichdir) { + case 0: + CHECK_BETTER(left, tr, tc - hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(diag, tr - hstep, tc - hstep); + break; + case 1: + CHECK_BETTER(right, tr, tc + hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(diag, tr - hstep, tc + hstep); + break; + case 2: + CHECK_BETTER(left, tr, tc - hstep); + CHECK_BETTER(down, tr + hstep, tc); + CHECK_BETTER(diag, tr + hstep, tc - hstep); + break; + case 3: + CHECK_BETTER(right, tr, tc + hstep); + CHECK_BETTER(down, tr + hstep, tc); + CHECK_BETTER(diag, tr + hstep, tc + hstep); + break; + } + } else { + FIRST_LEVEL_CHECKS; + if (halfiters > 1) { + SECOND_LEVEL_CHECKS; + } + } + + tr = br; + tc = bc; + + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel + + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (quarteriters > 1) { + SECOND_LEVEL_CHECKS; + } + tr = br; + tc = bc; + } + + if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (eighthiters > 1) { + SECOND_LEVEL_CHECKS; + } + tr = br; + tc = bc; + } + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. 
+ (void) tr; + (void) tc; + + bestmv->row = br; + bestmv->col = bc; + + if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) + return INT_MAX; + + return besterr; +} + int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, @@ -263,55 +394,14 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, + int *sad_list, int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, const uint8_t *second_pred, int w, int h) { - const uint8_t *const z = x->plane[0].src.buf; - const int src_stride = x->plane[0].src.stride; - const MACROBLOCKD *xd = &x->e_mbd; - unsigned int besterr = INT_MAX; - unsigned int sse; - unsigned int whichdir; - int thismse; - const unsigned int halfiters = iters_per_step; - const unsigned int quarteriters = iters_per_step; - const unsigned int eighthiters = iters_per_step; - - const int y_stride = xd->plane[0].pre[0].stride; - const int offset = bestmv->row * y_stride + bestmv->col; - const uint8_t *const y = xd->plane[0].pre[0].buf; - - int rr = ref_mv->row; - int rc = ref_mv->col; - int br = bestmv->row * 8; - int bc = bestmv->col * 8; - int hstep = 4; - const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); - const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); - const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); - const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); - - int tr = br; - int tc = bc; - - // central mv - bestmv->row *= 8; - bestmv->col *= 8; - - // calculate central point error - // TODO(yunqingwang): central pointer error was already calculated in full- - // pixel search, and can be passed in this function. 
- if (second_pred != NULL) { - DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); - vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); - besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); - } else { - besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); - } - *distortion = besterr; - besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); + SETUP_SUBPEL_SEARCH; + (void) sad_list; // to silence compiler warning // Each subsequent iteration checks at least one point in // common with the last iteration could be 2 ( if diag selected) @@ -398,14 +488,17 @@ static INLINE int is_mv_in(const MACROBLOCK *x, const MV *mv) { // Each scale can have a different number of candidates and shape of // candidates as indicated in the num_candidates and candidates arrays // passed into this function +// static int vp9_pattern_search(const MACROBLOCK *x, MV *ref_mv, int search_param, int sad_per_bit, - int do_init_search, int do_refine, + int do_init_search, + int *sad_list, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, - const MV *center_mv, MV *best_mv, + const MV *center_mv, + MV *best_mv, const int num_candidates[MAX_PATTERN_SCALES], const MV candidates[MAX_PATTERN_SCALES] [MAX_PATTERN_CANDIDATES]) { @@ -413,7 +506,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = { 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, }; - int i, j, s, t; + int i, s, t; const struct buf_2d *const what = &x->plane[0].src; const struct buf_2d *const in_what = &xd->plane[0].pre[0]; int br, bc; @@ -552,47 +645,38 @@ static int vp9_pattern_search(const MACROBLOCK *x, } while (s--); } - // Check 4 1-away neighbors if do_refine is true. - // For most well-designed schemes do_refine will not be necessary. 
- if (do_refine) { - static const MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}}; - - for (j = 0; j < 16; j++) { - int best_site = -1; - if (check_bounds(x, br, bc, 1)) { - for (i = 0; i < 4; i++) { - const MV this_mv = {br + neighbors[i].row, - bc + neighbors[i].col}; - thissad = vfp->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, &this_mv), - in_what->stride); - CHECK_BETTER - } - } else { - for (i = 0; i < 4; i++) { - const MV this_mv = {br + neighbors[i].row, - bc + neighbors[i].col}; - if (!is_mv_in(x, &this_mv)) - continue; - thissad = vfp->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, &this_mv), - in_what->stride); - CHECK_BETTER - } + // Returns the one-away integer pel sad values around the best as follows: + // sad_list[0]: sad at the best integer pel + // sad_list[1]: sad at delta {0, -1} (left) from the best integer pel + // sad_list[2]: sad at delta {-1, 0} (top) from the best integer pel + // sad_list[3]: sad at delta { 0, 1} (right) from the best integer pel + // sad_list[4]: sad at delta { 1, 0} (bottom) from the best integer pel + if (sad_list) { + static const MV neighbors[4] = {{0, -1}, {-1, 0}, {0, 1}, {1, 0}}; + sad_list[0] = bestsad; + if (check_bounds(x, br, bc, 1)) { + for (i = 0; i < 4; i++) { + const MV this_mv = {br + neighbors[i].row, + bc + neighbors[i].col}; + sad_list[i + 1] = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); } - - if (best_site == -1) { - break; - } else { - br += neighbors[best_site].row; - bc += neighbors[best_site].col; + } else { + for (i = 0; i < 4; i++) { + const MV this_mv = {br + neighbors[i].row, + bc + neighbors[i].col}; + if (!is_mv_in(x, &this_mv)) + sad_list[i + 1] = INT_MAX; + else + sad_list[i + 1] = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); } } } - best_mv->row = br; best_mv->col = bc; - return bestsad; } @@ -634,6 +718,7 @@ int vp9_hex_search(const MACROBLOCK *x, int search_param, int 
sad_per_bit, int do_init_search, + int *sad_list, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, const MV *center_mv, MV *best_mv) { @@ -658,7 +743,7 @@ int vp9_hex_search(const MACROBLOCK *x, { -1024, 0}}, }; return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, - do_init_search, 0, vfp, use_mvcost, + do_init_search, sad_list, vfp, use_mvcost, center_mv, best_mv, hex_num_candidates, hex_candidates); } @@ -668,6 +753,7 @@ int vp9_bigdia_search(const MACROBLOCK *x, int search_param, int sad_per_bit, int do_init_search, + int *sad_list, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, const MV *center_mv, @@ -699,7 +785,7 @@ int vp9_bigdia_search(const MACROBLOCK *x, {-512, 512}, {-1024, 0}}, }; return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, - do_init_search, 0, vfp, use_mvcost, + do_init_search, sad_list, vfp, use_mvcost, center_mv, best_mv, bigdia_num_candidates, bigdia_candidates); } @@ -709,6 +795,7 @@ int vp9_square_search(const MACROBLOCK *x, int search_param, int sad_per_bit, int do_init_search, + int *sad_list, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, const MV *center_mv, @@ -740,7 +827,7 @@ int vp9_square_search(const MACROBLOCK *x, {0, 1024}, {-1024, 1024}, {-1024, 0}}, }; return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, - do_init_search, 0, vfp, use_mvcost, + do_init_search, sad_list, vfp, use_mvcost, center_mv, best_mv, square_num_candidates, square_candidates); } @@ -750,12 +837,13 @@ int vp9_fast_hex_search(const MACROBLOCK *x, int search_param, int sad_per_bit, int do_init_search, // must be zero for fast_hex + int *sad_list, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, const MV *center_mv, MV *best_mv) { return vp9_hex_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param), - sad_per_bit, do_init_search, vfp, use_mvcost, + sad_per_bit, do_init_search, sad_list, vfp, use_mvcost, center_mv, best_mv); } @@ -764,13 +852,14 @@ int vp9_fast_dia_search(const MACROBLOCK *x, int search_param, int 
sad_per_bit, int do_init_search, + int *sad_list, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, const MV *center_mv, MV *best_mv) { return vp9_bigdia_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param), - sad_per_bit, do_init_search, vfp, use_mvcost, - center_mv, best_mv); + sad_per_bit, do_init_search, sad_list, vfp, + use_mvcost, center_mv, best_mv); } #undef CHECK_BETTER @@ -1368,33 +1457,41 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, int error_per_bit, + int *sad_list, const MV *ref_mv, MV *tmp_mv, int var_max, int rd) { const SPEED_FEATURES *const sf = &cpi->sf; const SEARCH_METHODS method = sf->mv.search_method; vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; int var = 0; + if (sad_list) { + sad_list[0] = INT_MAX; + sad_list[1] = INT_MAX; + sad_list[2] = INT_MAX; + sad_list[3] = INT_MAX; + sad_list[4] = INT_MAX; + } switch (method) { case FAST_DIAMOND: var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0, - fn_ptr, 1, ref_mv, tmp_mv); + sad_list, fn_ptr, 1, ref_mv, tmp_mv); break; case FAST_HEX: var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0, - fn_ptr, 1, ref_mv, tmp_mv); + sad_list, fn_ptr, 1, ref_mv, tmp_mv); break; case HEX: var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1, - fn_ptr, 1, ref_mv, tmp_mv); + sad_list, fn_ptr, 1, ref_mv, tmp_mv); break; case SQUARE: var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1, - fn_ptr, 1, ref_mv, tmp_mv); + sad_list, fn_ptr, 1, ref_mv, tmp_mv); break; case BIGDIA: var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1, - fn_ptr, 1, ref_mv, tmp_mv); + sad_list, fn_ptr, 1, ref_mv, tmp_mv); break; case NSTEP: var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, diff --git a/source/libvpx/vp9/encoder/vp9_mcomp.h b/source/libvpx/vp9/encoder/vp9_mcomp.h index 298fbb6..9b4734a 100644 --- 
a/source/libvpx/vp9/encoder/vp9_mcomp.h +++ b/source/libvpx/vp9/encoder/vp9_mcomp.h @@ -79,6 +79,7 @@ typedef int (integer_mv_pattern_search_fn) ( int search_param, int error_per_bit, int do_init_search, + int *sad_list, const vp9_variance_fn_ptr_t *vf, int use_mvcost, const MV *center_mv, @@ -98,12 +99,14 @@ typedef int (fractional_mv_step_fp) ( const vp9_variance_fn_ptr_t *vfp, int forced_stop, // 0 - full, 1 - qtr only, 2 - half only int iters_per_step, + int *sad_list, int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, const uint8_t *second_pred, int w, int h); extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; +extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned; typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, @@ -136,8 +139,10 @@ struct VP9_COMP; int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, int error_per_bit, + int *sad_list, const MV *ref_mv, MV *tmp_mv, int var_max, int rd); + #ifdef __cplusplus } // extern "C" #endif diff --git a/source/libvpx/vp9/encoder/vp9_pickmode.c b/source/libvpx/vp9/encoder/vp9_pickmode.c index 5646f5b..eee6ffe 100644 --- a/source/libvpx/vp9/encoder/vp9_pickmode.c +++ b/source/libvpx/vp9/encoder/vp9_pickmode.c @@ -126,6 +126,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const int tmp_row_min = x->mv_row_min; const int tmp_row_max = x->mv_row_max; int rv = 0; + int sad_list[5]; const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); if (cpi->common.show_frame && @@ -152,8 +153,9 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, mvp_full.col >>= 3; mvp_full.row >>= 3; - vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, &ref_mv, - &tmp_mv->as_mv, INT_MAX, 0); + vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, + cond_sad_list(cpi, sad_list), + &ref_mv, &tmp_mv->as_mv, INT_MAX, 0); x->mv_col_min 
= tmp_col_min; x->mv_col_max = tmp_col_max; @@ -179,6 +181,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, + cond_sad_list(cpi, sad_list), x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0); x->pred_mv[ref] = tmp_mv->as_mv; @@ -391,7 +394,7 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, args->dist += dist; } -static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = { +static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][INTER_MODES] = { {THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV}, {THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG}, {THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA}, @@ -420,7 +423,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, VP9_ALT_FLAG }; int64_t best_rd = INT64_MAX; int64_t this_rd = INT64_MAX; - int skip_txfm = 0; + uint8_t skip_txfm = 0; int rate = INT_MAX; int64_t dist = INT64_MAX; // var_y and sse_y are saved to be used in skipping checking @@ -544,7 +547,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, continue; mode_rd_thresh = rd_threshes[mode_idx[ref_frame - LAST_FRAME] - [this_mode - NEARESTMV]]; + [INTER_OFFSET(this_mode)]]; if (rd_less_than_thresh(best_rd, mode_rd_thresh, rd_thresh_freq_fact[this_mode])) continue; @@ -656,8 +659,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { - vp9_denoiser_update_frame_stats(&cpi->denoiser, mbmi, sse_y, - this_mode, ctx); + vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx); } #endif diff --git a/source/libvpx/vp9/encoder/vp9_ratectrl.c b/source/libvpx/vp9/encoder/vp9_ratectrl.c index b926a58..b607c85 100644 --- a/source/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/source/libvpx/vp9/encoder/vp9_ratectrl.c @@ -1235,7 +1235,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { cm->frame_type = KEY_FRAME; rc->source_alt_ref_active = 0; 
- if (is_spatial_svc(cpi)) { + if (is_two_pass_svc(cpi)) { cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame = 1; cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); @@ -1247,7 +1247,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { } else { cm->frame_type = INTER_FRAME; - if (is_spatial_svc(cpi)) { + if (is_two_pass_svc(cpi)) { LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; if (cpi->svc.spatial_layer_id == 0) { lc->is_key_frame = 0; diff --git a/source/libvpx/vp9/encoder/vp9_rd.c b/source/libvpx/vp9/encoder/vp9_rd.c index 4fc3e9e..b826ff4 100644 --- a/source/libvpx/vp9/encoder/vp9_rd.c +++ b/source/libvpx/vp9/encoder/vp9_rd.c @@ -364,20 +364,16 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, int ref_frame, BLOCK_SIZE block_size) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - int_mv this_mv; int i; int zero_seen = 0; int best_index = 0; int best_sad = INT_MAX; int this_sad = INT_MAX; int max_mv = 0; - uint8_t *src_y_ptr = x->plane[0].src.buf; uint8_t *ref_y_ptr; - int row_offset, col_offset; - int num_mv_refs = MAX_MV_REF_CANDIDATES + + const int num_mv_refs = MAX_MV_REF_CANDIDATES + (cpi->sf.adaptive_motion_search && - cpi->common.show_frame && block_size < cpi->sf.max_partition_size); MV pred_mv[3]; @@ -387,19 +383,16 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, // Get the sad for each candidate reference mv. for (i = 0; i < num_mv_refs; ++i) { - this_mv.as_mv = pred_mv[i]; + const MV *this_mv = &pred_mv[i]; - max_mv = MAX(max_mv, - MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3); - // Only need to check zero mv once. 
- if (!this_mv.as_int && zero_seen) + max_mv = MAX(max_mv, MAX(abs(this_mv->row), abs(this_mv->col)) >> 3); + if (is_zero_mv(this_mv) && zero_seen) continue; - zero_seen = zero_seen || !this_mv.as_int; + zero_seen |= is_zero_mv(this_mv); - row_offset = this_mv.as_mv.row >> 3; - col_offset = this_mv.as_mv.col >> 3; - ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset; + ref_y_ptr = + &ref_y_buffer[ref_y_stride * (this_mv->row >> 3) + (this_mv->col >> 3)]; // Find sad for current vector. this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride, @@ -462,7 +455,7 @@ void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) { // Set baseline threshold values. for (i = 0; i < MAX_MODES; ++i) - rd->thresh_mult[i] = is_best_mode(cpi->oxcf.mode) ? -500 : 0; + rd->thresh_mult[i] = cpi->oxcf.mode == BEST ? -500 : 0; rd->thresh_mult[THR_NEARESTMV] = 0; rd->thresh_mult[THR_NEARESTG] = 0; @@ -548,7 +541,7 @@ void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) { int i; for (i = 0; i < MAX_REFS; ++i) - rd->thresh_mult_sub8x8[i] = is_best_mode(cpi->oxcf.mode) ? -500 : 0; + rd->thresh_mult_sub8x8[i] = cpi->oxcf.mode == BEST ? 
-500 : 0; rd->thresh_mult_sub8x8[THR_LAST] += 2500; rd->thresh_mult_sub8x8[THR_GOLD] += 2500; diff --git a/source/libvpx/vp9/encoder/vp9_rdopt.c b/source/libvpx/vp9/encoder/vp9_rdopt.c index cfda964..506c9bc 100644 --- a/source/libvpx/vp9/encoder/vp9_rdopt.c +++ b/source/libvpx/vp9/encoder/vp9_rdopt.c @@ -171,30 +171,53 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, int64_t dist_sum = 0; const int ref = xd->mi[0]->mbmi.ref_frame[0]; unsigned int sse; + unsigned int var = 0; + unsigned int sum_sse = 0; const int shift = 8; + int rate; + int64_t dist; + + x->pred_sse[ref] = 0; for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblock_plane *const p = &x->plane[i]; struct macroblockd_plane *const pd = &xd->plane[i]; const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); + const TX_SIZE max_tx_size = max_txsize_lookup[bs]; + const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size]; + int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]); + int bh = 1 << (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]); + int idx, idy; + int lw = b_width_log2_lookup[unit_size] + 2; + int lh = b_height_log2_lookup[unit_size] + 2; + + sum_sse = 0; + + for (idy = 0; idy < bh; ++idy) { + for (idx = 0; idx < bw; ++idx) { + uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw); + uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lh); + int block_idx = (idy << 1) + idx; + + var = cpi->fn_ptr[unit_size].vf(src, p->src.stride, + dst, pd->dst.stride, &sse); + x->bsse[(i << 2) + block_idx] = sse; + sum_sse += sse; + + if (!x->select_tx_size) { + if (x->bsse[(i << 2) + block_idx] < p->quant_thred[0] >> shift) + x->skip_txfm[(i << 2) + block_idx] = 1; + else if (var < p->quant_thred[1] >> shift) + x->skip_txfm[(i << 2) + block_idx] = 2; + else + x->skip_txfm[(i << 2) + block_idx] = 0; + } - const unsigned int var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride, - &sse); - - if 
(!x->select_tx_size) { - if (sse < p->quant_thred[0] >> shift) - x->skip_txfm[i] = 1; - else if (var < p->quant_thred[1] >> shift) - x->skip_txfm[i] = 2; - else - x->skip_txfm[i] = 0; + if (i == 0) + x->pred_sse[ref] += sse; + } } - x->bsse[i] = sse; - if (i == 0) - x->pred_sse[ref] = sse; - // Fast approximate the modelling function. if (cpi->oxcf.speed > 4) { int64_t rate; @@ -210,9 +233,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, rate_sum += rate; dist_sum += dist; } else { - int rate; - int64_t dist; - vp9_model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs], + vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs], pd->dequant[1] >> 3, &rate, &dist); rate_sum += rate; dist_sum += dist; @@ -372,17 +393,17 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, if (!is_inter_block(mbmi)) { vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip); dist_block(plane, block, tx_size, args); - } else { - if (x->skip_txfm[plane] == 0) { + } else if (max_txsize_lookup[plane_bsize] == tx_size) { + if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) { // full forward transform and quantization vp9_xform_quant(x, plane, block, plane_bsize, tx_size); dist_block(plane, block, tx_size, args); - } else if (x->skip_txfm[plane] == 2) { + } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) { // compute DC coefficient int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); int16_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size); - args->sse = x->bsse[plane] << 4; + args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; args->dist = args->sse; if (!x->plane[plane].eobs[block]) args->dist = args->sse - ((coeff[0] * coeff[0] - @@ -390,9 +411,13 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, } else { // skip forward transform 
x->plane[plane].eobs[block] = 0; - args->sse = x->bsse[plane] << 4; + args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; args->dist = args->sse; } + } else { + // full forward transform and quantization + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); + dist_block(plane, block, tx_size, args); } rate_block(plane, block, plane_bsize, tx_size, args); @@ -468,7 +493,6 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, txfm_rd_in_plane(x, rate, distortion, skip, sse, ref_best_rd, 0, bs, mbmi->tx_size, cpi->sf.use_fast_coef_costing); - cpi->tx_stepdown_count[0]++; } static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, @@ -551,60 +575,36 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, if (max_tx_size == TX_32X32 && best_tx == TX_32X32) { tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1]; - cpi->tx_stepdown_count[0]++; } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) { tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1]; - cpi->tx_stepdown_count[max_tx_size - TX_16X16]++; } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) { tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1]; - cpi->tx_stepdown_count[max_tx_size - TX_8X8]++; } else { tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1]; - cpi->tx_stepdown_count[max_tx_size - TX_4X4]++; } } -static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, - int64_t *distortion, int *skip, - int64_t *psse, BLOCK_SIZE bs, - int64_t txfm_cache[TX_MODES], - int64_t ref_best_rd) { +static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, + int64_t *distortion, int *skip, + int64_t *psse, BLOCK_SIZE bs, + int64_t txfm_cache[TX_MODES], + int64_t ref_best_rd) { MACROBLOCKD *xd = &x->e_mbd; + int64_t sse; + int64_t *ret_sse = psse ? 
psse : &sse; assert(bs == xd->mi[0]->mbmi.sb_type); - vp9_subtract_plane(x, bs, 0); - if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) { vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); - choose_largest_tx_size(cpi, x, rate, distortion, skip, psse, ref_best_rd, + choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd, bs); } else { - choose_tx_size_from_rd(cpi, x, rate, distortion, skip, psse, + choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse, txfm_cache, ref_best_rd, bs); } } -static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, - int64_t *distortion, int *skip, - BLOCK_SIZE bs, - int64_t txfm_cache[TX_MODES], - int64_t ref_best_rd) { - MACROBLOCKD *xd = &x->e_mbd; - int64_t sse; - - assert(bs == xd->mi[0]->mbmi.sb_type); - if (cpi->sf.tx_size_search_method != USE_FULL_RD || xd->lossless) { - vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); - choose_largest_tx_size(cpi, x, rate, distortion, skip, &sse, ref_best_rd, - bs); - } else { - choose_tx_size_from_rd(cpi, x, rate, distortion, skip, &sse, - txfm_cache, ref_best_rd, bs); - } -} - - static int conditional_skipintra(PREDICTION_MODE mode, PREDICTION_MODE best_intra_mode) { if (mode == D117_PRED && @@ -854,8 +854,8 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, } mic->mbmi.mode = mode; - intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, - &s, bsize, local_tx_cache, best_rd); + super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, + &s, NULL, bsize, local_tx_cache, best_rd); if (this_rate_tokenonly == INT_MAX) continue; @@ -1365,13 +1365,14 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, int sadpb = x->sadperbit4; MV mvp_full; int max_mv; + int sad_list[5]; /* Is the best so far sufficiently good that we cant justify doing * and new motion search. 
*/ if (best_rd < label_mv_thresh) break; - if (!is_best_mode(cpi->oxcf.mode)) { + if (cpi->oxcf.mode != BEST) { // use previous block's result as next block's MV predictor. if (i > 0) { bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int; @@ -1397,7 +1398,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, mvp_full.row = bsi->mvp.as_mv.row >> 3; mvp_full.col = bsi->mvp.as_mv.col >> 3; - if (cpi->sf.adaptive_motion_search && cm->show_frame) { + if (cpi->sf.adaptive_motion_search) { mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3; mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3; step_param = MAX(step_param, 8); @@ -1408,12 +1409,14 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv); - bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, - sadpb, &bsi->ref_mv[0]->as_mv, new_mv, - INT_MAX, 1); + bestsme = vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, sadpb, + cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? sad_list : NULL, + &bsi->ref_mv[0]->as_mv, new_mv, + INT_MAX, 1); // Should we do a full search (best quality only) - if (is_best_mode(cpi->oxcf.mode)) { + if (cpi->oxcf.mode == BEST) { int_mv *const best_mv = &mi->bmi[i].as_mv[0]; /* Check if mvp_full is within the range. 
*/ clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, @@ -1422,6 +1425,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, sadpb, 16, &cpi->fn_ptr[bsize], &bsi->ref_mv[0]->as_mv, &best_mv->as_mv); + sad_list[1] = sad_list[2] = sad_list[3] = sad_list[4] = INT_MAX; if (thissme < bestsme) { bestsme = thissme; *new_mv = best_mv->as_mv; @@ -1434,17 +1438,19 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int distortion; - cpi->find_fractional_mv_step(x, - new_mv, - &bsi->ref_mv[0]->as_mv, - cm->allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[bsize], - cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, - x->nmvjointcost, x->mvcost, - &distortion, - &x->pred_sse[mbmi->ref_frame[0]], - NULL, 0, 0); + cpi->find_fractional_mv_step( + x, + new_mv, + &bsi->ref_mv[0]->as_mv, + cm->allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, + cond_sad_list(cpi, sad_list), + x->nmvjointcost, x->mvcost, + &distortion, + &x->pred_sse[mbmi->ref_frame[0]], + NULL, 0, 0); // save motion search result for use in compound prediction seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv; @@ -1701,12 +1707,14 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index, int64_t comp_pred_diff[REFERENCE_MODES], const int64_t tx_size_diff[TX_MODES], - int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) { + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS], + int skippable) { MACROBLOCKD *const xd = &x->e_mbd; // Take a snapshot of the coding context so it can be // restored if we decide to encode this way ctx->skip = x->skip; + ctx->skippable = skippable; ctx->best_mode_index = mode_index; ctx->mic = *xd->mi[0]; ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE]; @@ -1772,6 +1780,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int tmp_col_max = x->mv_col_max; int 
tmp_row_min = x->mv_row_min; int tmp_row_max = x->mv_row_max; + int sad_list[5]; const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); @@ -1806,8 +1815,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, step_param = cpi->mv_step_param; } - if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 && - cm->show_frame) { + if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) { int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize), b_width_log2(bsize))); step_param = MAX(step_param, boffset); @@ -1844,6 +1852,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, mvp_full.row >>= 3; bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, + cond_sad_list(cpi, sad_list), &ref_mv, &tmp_mv->as_mv, INT_MAX, 1); x->mv_col_min = tmp_col_min; @@ -1859,13 +1868,14 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, + cond_sad_list(cpi, sad_list), x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0); } *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - if (cpi->sf.adaptive_motion_search && cm->show_frame) + if (cpi->sf.adaptive_motion_search) x->pred_mv[ref] = tmp_mv->as_mv; if (scaled_ref_frame) { @@ -1983,6 +1993,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, x->errorperbit, &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, + NULL, x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, pw, ph); @@ -2118,6 +2129,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], + INTERP_FILTER (*single_filter)[MAX_REF_FRAMES], + int (*single_skippable)[MAX_REF_FRAMES], int64_t *psse, const int64_t ref_best_rd) { VP9_COMMON *cm = &cpi->common; @@ -2135,14 +2148,14 @@ static int64_t 
handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64); int pred_exists = 0; int intpel_mv; - int64_t rd, best_rd = INT64_MAX; + int64_t rd, tmp_rd, best_rd = INT64_MAX; int best_needs_copy = 0; uint8_t *orig_dst[MAX_MB_PLANE]; int orig_dst_stride[MAX_MB_PLANE]; int rs = 0; INTERP_FILTER best_filter = SWITCHABLE; - int skip_txfm[MAX_MB_PLANE] = {0}; - int64_t bsse[MAX_MB_PLANE] = {0}; + uint8_t skip_txfm[MAX_MB_PLANE << 2] = {0}; + int64_t bsse[MAX_MB_PLANE << 2] = {0}; int bsl = mi_width_log2_lookup[bsize]; int pred_filter_search = cpi->sf.cb_pred_filter_search ? @@ -2164,6 +2177,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (frame_mv[refs[0]].as_int == INVALID_MV || frame_mv[refs[1]].as_int == INVALID_MV) return INT64_MAX; + + if (cpi->sf.adaptive_mode_search) { + if (single_filter[this_mode][refs[0]] == + single_filter[this_mode][refs[1]]) + best_filter = single_filter[this_mode][refs[0]]; + } } if (this_mode == NEWMV) { @@ -2225,6 +2244,10 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, * if the first is known */ *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]); + if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd && + mbmi->mode != NEARESTMV) + return INT64_MAX; + pred_exists = 0; // Are all MVs integer pel for Y and UV intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv); @@ -2263,6 +2286,13 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } else { int rate_sum = 0; int64_t dist_sum = 0; + if (i > 0 && cpi->sf.adaptive_interp_filter_search && + (cpi->sf.interp_filter_search_mask & (1 << i))) { + rate_sum = INT_MAX; + dist_sum = INT64_MAX; + continue; + } + if ((cm->interp_filter == SWITCHABLE && (!i || best_needs_copy)) || (cm->interp_filter != SWITCHABLE && @@ -2313,6 +2343,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, (cm->interp_filter != SWITCHABLE && cm->interp_filter == mbmi->interp_filter)) { 
pred_exists = 1; + tmp_rd = best_rd; } } restore_dst_buf(xd, orig_dst, orig_dst_stride); @@ -2331,17 +2362,30 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, xd->plane[i].dst.stride = 64; } } + rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0); } else { + int tmp_rate; + int64_t tmp_dist; // Handles the special case when a filter that is not in the - // switchable list (ex. bilinear, 6-tap) is indicated at the frame level + // switchable list (ex. bilinear) is indicated at the frame level, or + // skip condition holds. vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); + model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist); + rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); + vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); + vpx_memcpy(bsse, x->bsse, sizeof(bsse)); } + if (!is_comp_pred) + single_filter[this_mode][refs[0]] = mbmi->interp_filter; + + if (cpi->sf.adaptive_mode_search) + if (is_comp_pred) + if (single_skippable[this_mode][refs[0]] && + single_skippable[this_mode][refs[1]]) + vpx_memset(skip_txfm, 1, sizeof(skip_txfm)); + if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { - int tmp_rate; - int64_t tmp_dist; - model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist); - rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); // if current pred_error modeled rd is substantially more than the best // so far, do not bother doing full rd if (rd / 2 > ref_best_rd) { @@ -2351,7 +2395,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } if (cm->interp_filter == SWITCHABLE) - *rate2 += vp9_get_switchable_rate(cpi); + *rate2 += rs; if (!is_comp_pred) { if (cpi->allow_encode_breakout) @@ -2368,8 +2412,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int64_t rdcosty = INT64_MAX; // Y cost and distortion - inter_super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse, - bsize, txfm_cache, ref_best_rd); + vp9_subtract_plane(x, bsize, 0); + super_block_yrd(cpi, x, 
rate_y, distortion_y, &skippable_y, psse, + bsize, txfm_cache, ref_best_rd); if (*rate_y == INT_MAX) { *rate2 = INT_MAX; @@ -2399,6 +2444,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, *skippable = skippable_y && skippable_uv; } + if (!is_comp_pred) + single_skippable[this_mode][refs[0]] = *skippable; + restore_dst_buf(xd, orig_dst, orig_dst_stride); return this_rd; // if 0, this will be re-calculated by caller } @@ -2505,10 +2553,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, PREDICTION_MODE this_mode; MV_REFERENCE_FRAME ref_frame, second_ref_frame; unsigned char segment_id = mbmi->segment_id; - int comp_pred, i; + int comp_pred, i, k; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; + INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES]; + int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; int64_t best_rd = best_rd_so_far; @@ -2519,6 +2569,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS]; int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; MB_MODE_INFO best_mbmode; + int best_mode_skippable = 0; int mode_index, best_mode_index = -1; unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES]; vp9_prob comp_mode_p; @@ -2556,6 +2607,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, rate_uv_intra[i] = INT_MAX; for (i = 0; i < MAX_REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; + for (i = 0; i < MB_MODE_COUNT; ++i) { + for (k = 0; k < MAX_REF_FRAMES; ++k) { + single_inter_filter[i][k] = SWITCHABLE; + single_skippable[i][k] = 0; + } + } *returnrate = INT_MAX; @@ -2732,6 +2789,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { + if (!cm->allow_comp_inter_inter) + 
continue; + if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && best_mode_index >=0 && vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) @@ -2747,6 +2807,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } if (ref_frame == INTRA_FRAME) { + if (cpi->sf.adaptive_mode_search) + if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_intra_rd) + continue; + if (!(intra_y_mode_mask & (1 << this_mode))) continue; if (this_mode != DC_PRED) { @@ -2785,6 +2849,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // them for this frame. mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter; + mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; + x->skip = 0; set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); @@ -2800,8 +2866,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; - intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, - bsize, tx_cache, best_rd); + super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, + NULL, bsize, tx_cache, best_rd); if (rate_y == INT_MAX) continue; @@ -2831,7 +2897,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, &rate_uv, &distortion_uv, &disable_skip, frame_mv, mi_row, mi_col, - single_newmv, &total_sse, best_rd); + single_newmv, single_inter_filter, + single_skippable, &total_sse, best_rd); if (this_rd == INT64_MAX) continue; @@ -2919,6 +2986,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, /* required for left and above block mv */ mbmi->mv[0].as_int = 0; max_plane = 1; + } else { + best_intra_rd = x->pred_sse[ref_frame]; } *returnrate = rate2; @@ -2926,6 +2995,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_rd = this_rd; best_mbmode = *mbmi; best_skip2 = this_skip2; + best_mode_skippable = skippable; + if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane); vpx_memcpy(ctx->zcoeff_blk, 
x->zcoeff_blk[mbmi->tx_size], @@ -3025,6 +3096,28 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, break; } + // The inter modes' rate costs are not calculated precisely in some cases. + // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and + // ZEROMV. Here, checks are added for those cases, and the mode decisions + // are corrected. + if (best_mbmode.mode == NEWMV) { + const MV_REFERENCE_FRAME refs[2] = {best_mbmode.ref_frame[0], + best_mbmode.ref_frame[1]}; + int comp_pred_mode = refs[1] > INTRA_FRAME; + + if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int && + ((comp_pred_mode && frame_mv[NEARESTMV][refs[1]].as_int == + best_mbmode.mv[1].as_int) || !comp_pred_mode)) + best_mbmode.mode = NEARESTMV; + else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int && + ((comp_pred_mode && frame_mv[NEARMV][refs[1]].as_int == + best_mbmode.mv[1].as_int) || !comp_pred_mode)) + best_mbmode.mode = NEARMV; + else if (best_mbmode.mv[0].as_int == 0 && + ((comp_pred_mode && best_mbmode.mv[1].as_int == 0) || !comp_pred_mode)) + best_mbmode.mode = ZEROMV; + } + if (best_mode_index < 0 || best_rd >= best_rd_so_far) return INT64_MAX; @@ -3082,8 +3175,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); - store_coding_context(x, ctx, best_mode_index, - best_pred_diff, best_tx_diff, best_filter_diff); + store_coding_context(x, ctx, best_mode_index, best_pred_diff, + best_tx_diff, best_filter_diff, best_mode_skippable); return best_rd; } @@ -3188,7 +3281,7 @@ int64_t vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x, if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE); store_coding_context(x, ctx, THR_ZEROMV, - best_pred_diff, best_tx_diff, best_filter_diff); + best_pred_diff, best_tx_diff, best_filter_diff, 0); return this_rd; } @@ -3325,6 +3418,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, 
MACROBLOCK *x, comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { + if (!cm->allow_comp_inter_inter) + continue; + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; // Do not allow compound prediction if the segment level reference frame @@ -3793,7 +3889,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); store_coding_context(x, ctx, best_ref_index, - best_pred_diff, best_tx_diff, best_filter_diff); + best_pred_diff, best_tx_diff, best_filter_diff, 0); return best_rd; } diff --git a/source/libvpx/vp9/encoder/vp9_speed_features.c b/source/libvpx/vp9/encoder/vp9_speed_features.c index 57835ec..dbf4ae9 100644 --- a/source/libvpx/vp9/encoder/vp9_speed_features.c +++ b/source/libvpx/vp9/encoder/vp9_speed_features.c @@ -65,7 +65,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, const int boosted = frame_is_boosted(cpi); sf->adaptive_rd_thresh = 1; - sf->recode_loop = (speed < 1) ? 
ALLOW_RECODE : ALLOW_RECODE_KFMAXBW; sf->allow_skip_recode = 1; if (speed >= 1) { @@ -92,6 +91,12 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; sf->tx_size_search_breakout = 1; + + if (MIN(cm->width, cm->height) >= 720) + sf->partition_search_breakout_dist_thr = (1 << 23); + else + sf->partition_search_breakout_dist_thr = (1 << 21); + sf->partition_search_breakout_rate_thr = 500; } if (speed >= 2) { @@ -120,6 +125,12 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->auto_min_max_partition_size = CONSTRAIN_NEIGHBORING_MIN_MAX; sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION; sf->adjust_partitioning_from_last_frame = 1; + + if (MIN(cm->width, cm->height) >= 720) + sf->partition_search_breakout_dist_thr = (1 << 24); + else + sf->partition_search_breakout_dist_thr = (1 << 22); + sf->partition_search_breakout_rate_thr = 700; } if (speed >= 3) { @@ -132,17 +143,25 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; } sf->adaptive_pred_interp_filter = 0; + sf->adaptive_mode_search = 1; sf->cb_partition_search = !boosted; sf->cb_pred_filter_search = 1; sf->alt_ref_search_fp = 1; sf->motion_field_mode_search = !boosted; sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; - sf->last_partitioning_redo_frequency = 3; + sf->last_partitioning_redo_frequency = 2; sf->recode_loop = ALLOW_RECODE_KFMAXBW; sf->adaptive_rd_thresh = 3; sf->mode_skip_start = 6; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; + sf->adaptive_interp_filter_search = 1; + + if (MIN(cm->width, cm->height) >= 720) + sf->partition_search_breakout_dist_thr = (1 << 25); + else + sf->partition_search_breakout_dist_thr = (1 << 23); + sf->partition_search_breakout_rate_thr = 1000; } if (speed >= 4) { @@ -157,6 +176,12 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, 
sf->use_lp32x32fdct = 1; sf->use_fast_coef_updates = ONE_LOOP_REDUCED; sf->use_fast_coef_costing = 1; + + if (MIN(cm->width, cm->height) >= 720) + sf->partition_search_breakout_dist_thr = (1 << 26); + else + sf->partition_search_breakout_dist_thr = (1 << 24); + sf->partition_search_breakout_rate_thr = 1500; } if (speed >= 5) { @@ -180,8 +205,8 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, vp9e_tune_content content) { VP9_COMMON *const cm = &cpi->common; - const int frames_since_key = - cm->frame_type == KEY_FRAME ? 0 : cpi->rc.frames_since_key; + const int is_keyframe = cm->frame_type == KEY_FRAME; + const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key; sf->static_segmentation = 0; sf->adaptive_rd_thresh = 1; sf->use_fast_coef_costing = 1; @@ -277,17 +302,16 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, } if (speed >= 5) { - sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1; - sf->auto_min_max_partition_size = (cm->frame_type == KEY_FRAME) ? - RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX; + sf->use_quant_fp = !is_keyframe; + sf->auto_min_max_partition_size = is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX + : STRICT_NEIGHBORING_MIN_MAX; sf->max_partition_size = BLOCK_32X32; sf->min_partition_size = BLOCK_8X8; sf->partition_check = (frames_since_key % sf->last_partitioning_redo_frequency == 1); - sf->force_frame_boost = cm->frame_type == KEY_FRAME || - (frames_since_key % - (sf->last_partitioning_redo_frequency << 1) == 1); - sf->max_delta_qindex = (cm->frame_type == KEY_FRAME) ? 20 : 15; + sf->force_frame_boost = is_keyframe || + (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1); + sf->max_delta_qindex = is_keyframe ? 
20 : 15; sf->partition_search_type = REFERENCE_PARTITION; sf->use_nonrd_pick_mode = 1; sf->allow_skip_recode = 0; @@ -305,8 +329,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->partition_search_type = SOURCE_VAR_BASED_PARTITION; sf->search_type_check_frequency = 50; - sf->tx_size_search_method = (cm->frame_type == KEY_FRAME) ? - USE_LARGESTALL : USE_TX_8X8; + sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8; // This feature is only enabled when partition search is disabled. sf->reuse_inter_pred_sby = 1; @@ -316,6 +339,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->mv.reduce_first_step_size = 1; } + if (speed >= 7) { sf->mv.search_method = FAST_DIAMOND; sf->mv.fullpel_search_step_param = 10; @@ -324,10 +348,12 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, 800 : 300; sf->elevate_newmv_thresh = 2500; } + if (speed >= 12) { sf->elevate_newmv_thresh = 4000; sf->mv.subpel_force_stop = 2; } + if (speed >= 13) { int i; sf->max_intra_bsize = BLOCK_32X32; @@ -360,6 +386,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_lp32x32fdct = 0; sf->adaptive_motion_search = 0; sf->adaptive_pred_interp_filter = 0; + sf->adaptive_mode_search = 0; sf->cb_pred_filter_search = 0; sf->cb_partition_search = 0; sf->motion_field_mode_search = 0; @@ -380,6 +407,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->force_frame_boost = 0; sf->max_delta_qindex = 0; sf->disable_filter_search_var_thresh = 0; + sf->adaptive_interp_filter_search = 0; + for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; sf->intra_uv_mode_mask[i] = INTRA_ALL; @@ -407,17 +436,17 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->recode_tolerance = 25; sf->default_interp_filter = SWITCHABLE; sf->tx_size_search_breakout = 0; + sf->partition_search_breakout_dist_thr = 0; + sf->partition_search_breakout_rate_thr = 0; - if (oxcf->mode == REALTIME) { + if (oxcf->mode == REALTIME) 
set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content); - } else { - if (!is_best_mode(oxcf->mode)) - set_good_speed_feature(cpi, cm, sf, oxcf->speed); - } + else if (oxcf->mode == GOOD) + set_good_speed_feature(cpi, cm, sf, oxcf->speed); cpi->full_search_sad = vp9_full_search_sad; - cpi->diamond_search_sad = is_best_mode(oxcf->mode) ? vp9_full_range_search - : vp9_diamond_search_sad; + cpi->diamond_search_sad = oxcf->mode == BEST ? vp9_full_range_search + : vp9_diamond_search_sad; cpi->refining_search_sad = vp9_refining_search_sad; @@ -434,6 +463,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) { if (sf->mv.subpel_search_method == SUBPEL_TREE) { cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree; + } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) { + cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned; } cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1; diff --git a/source/libvpx/vp9/encoder/vp9_speed_features.h b/source/libvpx/vp9/encoder/vp9_speed_features.h index bad956d..33c441f 100644 --- a/source/libvpx/vp9/encoder/vp9_speed_features.h +++ b/source/libvpx/vp9/encoder/vp9_speed_features.h @@ -40,6 +40,7 @@ typedef enum { typedef enum { SUBPEL_TREE = 0, + SUBPEL_TREE_PRUNED = 1, // Other methods to come } SUBPEL_SEARCH_METHODS; @@ -103,6 +104,12 @@ typedef enum { } MODE_SEARCH_SKIP_LOGIC; typedef enum { + FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP, + FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH, + FLAG_SKIP_EIGHTTAP_SHARP = 1 << EIGHTTAP_SHARP, +} INTERP_FILTER_MASK; + +typedef enum { // Search partitions using RD/NONRD criterion SEARCH_PARTITION = 0, @@ -284,6 +291,9 @@ typedef struct SPEED_FEATURES { // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected. 
int adaptive_pred_interp_filter; + // Adaptive prediction mode search + int adaptive_mode_search; + // Chessboard pattern prediction filter type search int cb_pred_filter_search; @@ -380,6 +390,16 @@ typedef struct SPEED_FEATURES { // Early termination in transform size search, which only applies while // tx_size_search_method is USE_FULL_RD. int tx_size_search_breakout; + + // adaptive interp_filter search to allow skip of certain filter types. + int adaptive_interp_filter_search; + + // mask for skip evaluation of certain interp_filter type. + INTERP_FILTER_MASK interp_filter_search_mask; + + // Partition search early breakout thresholds. + int64_t partition_search_breakout_dist_thr; + int partition_search_breakout_rate_thr; } SPEED_FEATURES; struct VP9_COMP; diff --git a/source/libvpx/vp9/encoder/vp9_ssim.c b/source/libvpx/vp9/encoder/vp9_ssim.c index 026e6a8..8435640 100644 --- a/source/libvpx/vp9/encoder/vp9_ssim.c +++ b/source/libvpx/vp9/encoder/vp9_ssim.c @@ -95,7 +95,7 @@ double vp9_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1, return ssim_total; } double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, - int lumamask, double *weight) { + double *weight) { double a, b, c; double ssimv; diff --git a/source/libvpx/vp9/encoder/vp9_ssim.h b/source/libvpx/vp9/encoder/vp9_ssim.h index a581c2c..d1dd1b7 100644 --- a/source/libvpx/vp9/encoder/vp9_ssim.h +++ b/source/libvpx/vp9/encoder/vp9_ssim.h @@ -18,7 +18,7 @@ extern "C" { #include "vpx_scale/yv12config.h" double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, - int lumamask, double *weight); + double *weight); double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double *ssim_y, double *ssim_u, double *ssim_v); diff --git a/source/libvpx/vp9/encoder/vp9_svc_layercontext.c b/source/libvpx/vp9/encoder/vp9_svc_layercontext.c index fb52d1a..7545d87 100644 --- a/source/libvpx/vp9/encoder/vp9_svc_layercontext.c +++ 
b/source/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -19,12 +19,12 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { const VP9EncoderConfig *const oxcf = &cpi->oxcf; int layer; int layer_end; - int alt_ref_idx = svc->number_spatial_layers; + int alt_ref_idx = svc->number_spatial_layers * svc->number_temporal_layers; svc->spatial_layer_id = 0; svc->temporal_layer_id = 0; - if (svc->number_temporal_layers > 1) { + if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { layer_end = svc->number_temporal_layers; } else { layer_end = svc->number_spatial_layers; @@ -36,6 +36,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { int i; lc->current_video_frame_in_layer = 0; lc->layer_size = 0; + lc->frames_from_key_frame = 0; + lc->last_frame_type = FRAME_TYPES; lrc->ni_av_qi = oxcf->worst_allowed_q; lrc->total_actual_bits = 0; lrc->total_target_vs_actual = 0; @@ -50,7 +52,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { lrc->rate_correction_factors[i] = 1.0; } - if (svc->number_temporal_layers > 1) { + if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { lc->target_bandwidth = oxcf->ts_target_bitrate[layer]; lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q; lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q; @@ -75,7 +77,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { } // Still have extra buffer for base layer golden frame - if (svc->number_spatial_layers > 1 && alt_ref_idx < REF_FRAMES) + if (!(svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) + && alt_ref_idx < REF_FRAMES) svc->layer_context[0].gold_ref_idx = alt_ref_idx; } @@ -89,7 +92,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, int layer_end; float bitrate_alloc = 1.0; - if (svc->number_temporal_layers > 1) { + if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { layer_end = svc->number_temporal_layers; } else { layer_end = svc->number_spatial_layers; @@ -99,7 +102,7 @@ void 
vp9_update_layer_context_change_config(VP9_COMP *const cpi, LAYER_CONTEXT *const lc = &svc->layer_context[layer]; RATE_CONTROL *const lrc = &lc->rc; - if (svc->number_temporal_layers > 1) { + if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { lc->target_bandwidth = oxcf->ts_target_bitrate[layer]; } else { lc->target_bandwidth = oxcf->ss_target_bitrate[layer]; @@ -115,7 +118,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size); lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size); // Update framerate-related quantities. - if (svc->number_temporal_layers > 1) { + if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer]; } else { lc->framerate = cpi->framerate; @@ -128,16 +131,16 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, } } -static LAYER_CONTEXT *get_layer_context(SVC *svc) { - return svc->number_temporal_layers > 1 ? - &svc->layer_context[svc->temporal_layer_id] : - &svc->layer_context[svc->spatial_layer_id]; +static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) { + return (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ? 
+ &cpi->svc.layer_context[cpi->svc.temporal_layer_id] : + &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; } void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; - LAYER_CONTEXT *const lc = get_layer_context(svc); + LAYER_CONTEXT *const lc = get_layer_context(cpi); RATE_CONTROL *const lrc = &lc->rc; const int layer = svc->temporal_layer_id; @@ -159,7 +162,7 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) { const VP9EncoderConfig *const oxcf = &cpi->oxcf; - LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc); + LAYER_CONTEXT *const lc = get_layer_context(cpi); RATE_CONTROL *const lrc = &lc->rc; lc->framerate = framerate; @@ -172,7 +175,7 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) { } void vp9_restore_layer_context(VP9_COMP *const cpi) { - LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc); + LAYER_CONTEXT *const lc = get_layer_context(cpi); const int old_frame_since_key = cpi->rc.frames_since_key; const int old_frame_to_key = cpi->rc.frames_to_key; @@ -190,7 +193,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { void vp9_save_layer_context(VP9_COMP *const cpi) { const VP9EncoderConfig *const oxcf = &cpi->oxcf; - LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc); + LAYER_CONTEXT *const lc = get_layer_context(cpi); lc->rc = cpi->rc; lc->twopass = cpi->twopass; @@ -214,15 +217,17 @@ void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { svc->spatial_layer_id = 0; } -void vp9_inc_frame_in_layer(SVC *svc) { - LAYER_CONTEXT *const lc = (svc->number_temporal_layers > 1) - ? &svc->layer_context[svc->temporal_layer_id] - : &svc->layer_context[svc->spatial_layer_id]; +void vp9_inc_frame_in_layer(VP9_COMP *const cpi) { + LAYER_CONTEXT *const lc = + (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ? 
+ &cpi->svc.layer_context[cpi->svc.temporal_layer_id] : + &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; ++lc->current_video_frame_in_layer; + ++lc->frames_from_key_frame; } int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) { - return is_spatial_svc(cpi) && + return is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0 && cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame; } @@ -257,6 +262,7 @@ static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) { int layer_id; vpx_svc_parameters_t *layer_param; LAYER_CONTEXT *lc; + int count = 1 << (cpi->svc.number_temporal_layers - 1); // Find the next layer to be encoded for (layer_id = 0; layer_id < cpi->svc.number_spatial_layers; ++layer_id) { @@ -274,17 +280,36 @@ static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) { lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; - cpi->lst_fb_idx = cpi->svc.spatial_layer_id; + cpi->svc.temporal_layer_id = 0; + while ((lc->current_video_frame_in_layer % count) != 0) { + ++cpi->svc.temporal_layer_id; + count >>= 1; + } + + cpi->lst_fb_idx = + cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id; + if (lc->frames_from_key_frame < cpi->svc.number_temporal_layers) + cpi->ref_frame_flags &= ~VP9_LAST_FLAG; - if (cpi->svc.spatial_layer_id < 1) + if (cpi->svc.spatial_layer_id == 0) { + if (cpi->svc.temporal_layer_id == 0) cpi->gld_fb_idx = lc->gold_ref_idx >= 0 ? 
lc->gold_ref_idx : cpi->lst_fb_idx; - else - cpi->gld_fb_idx = cpi->svc.spatial_layer_id - 1; + else + cpi->gld_fb_idx = cpi->lst_fb_idx - 1; + } else { + if (cpi->svc.temporal_layer_id == 0) + cpi->gld_fb_idx = cpi->svc.spatial_layer_id - + cpi->svc.number_temporal_layers; + else + cpi->gld_fb_idx = cpi->lst_fb_idx - 1; + } if (lc->current_video_frame_in_layer == 0) { if (cpi->svc.spatial_layer_id >= 2) { - cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2; + cpi->alt_fb_idx = + cpi->svc.spatial_layer_id - 2 * cpi->svc.number_temporal_layers; } else { cpi->alt_fb_idx = cpi->lst_fb_idx; cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG); @@ -306,7 +331,8 @@ static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) { lc_lower->alt_ref_source != NULL) cpi->alt_fb_idx = lc_lower->alt_ref_idx; else if (cpi->svc.spatial_layer_id >= 2) - cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2; + cpi->alt_fb_idx = + cpi->svc.spatial_layer_id - 2 * cpi->svc.number_temporal_layers; else cpi->alt_fb_idx = cpi->lst_fb_idx; } @@ -325,7 +351,7 @@ static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) { vp9_set_high_precision_mv(cpi, 1); - cpi->alt_ref_source = get_layer_context(&cpi->svc)->alt_ref_source; + cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source; return 0; } diff --git a/source/libvpx/vp9/encoder/vp9_svc_layercontext.h b/source/libvpx/vp9/encoder/vp9_svc_layercontext.h index 801449b..1fc43a4 100644 --- a/source/libvpx/vp9/encoder/vp9_svc_layercontext.h +++ b/source/libvpx/vp9/encoder/vp9_svc_layercontext.h @@ -25,9 +25,11 @@ typedef struct { double framerate; int avg_frame_size; TWO_PASS twopass; - struct vpx_fixed_buf rc_twopass_stats_in; + vpx_fixed_buf_t rc_twopass_stats_in; unsigned int current_video_frame_in_layer; int is_key_frame; + int frames_from_key_frame; + FRAME_TYPE last_frame_type; vpx_svc_parameters_t svc_params_received; struct lookahead_entry *alt_ref_source; int alt_ref_idx; @@ -80,7 +82,7 @@ void 
vp9_save_layer_context(struct VP9_COMP *const cpi); void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi); // Increment number of video frames in layer -void vp9_inc_frame_in_layer(SVC *svc); +void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi); // Check if current layer is key frame in spatial upper layer int vp9_is_upper_layer_key_frame(const struct VP9_COMP *const cpi); diff --git a/source/libvpx/vp9/encoder/vp9_temporal_filter.c b/source/libvpx/vp9/encoder/vp9_temporal_filter.c index 076d776..18a6a91 100644 --- a/source/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/source/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -145,6 +145,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, int bestsme = INT_MAX; int distortion; unsigned int sse; + int sad_list[5]; MV best_ref_mv1 = {0, 0}; MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ @@ -168,6 +169,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, // Ignore mv costing by sending NULL pointer instead of cost arrays vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1, + cond_sad_list(cpi, sad_list), &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv); // Ignore mv costing by sending NULL pointer instead of cost array @@ -177,6 +179,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step, + cond_sad_list(cpi, sad_list), NULL, NULL, &distortion, &sse, NULL, 0, 0); @@ -188,6 +191,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, } static void temporal_filter_iterate_c(VP9_COMP *cpi, + YV12_BUFFER_CONFIG **frames, int frame_count, int alt_ref_index, int strength, @@ -203,7 +207,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 * 3); DECLARE_ALIGNED_ARRAY(16, uint16_t, count, 16 * 16 * 3); MACROBLOCKD *mbd = &cpi->mb.e_mbd; - YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index]; + YV12_BUFFER_CONFIG 
*f = frames[alt_ref_index]; uint8_t *dst1, *dst2; DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 * 3); const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y; @@ -247,7 +251,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, const int thresh_low = 10000; const int thresh_high = 20000; - if (cpi->frames[frame] == NULL) + if (frames[frame] == NULL) continue; mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0; @@ -258,9 +262,9 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, } else { // Find best match in this frame by MC int err = temporal_filter_find_matching_mb_c(cpi, - cpi->frames[alt_ref_index]->y_buffer + mb_y_offset, - cpi->frames[frame]->y_buffer + mb_y_offset, - cpi->frames[frame]->y_stride); + frames[alt_ref_index]->y_buffer + mb_y_offset, + frames[frame]->y_buffer + mb_y_offset, + frames[frame]->y_stride); // Assign higher weight to matching MB if it's error // score is lower. If not applying MC default behavior @@ -272,10 +276,10 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, if (filter_weight != 0) { // Construct the predictors temporal_filter_predictors_mb_c(mbd, - cpi->frames[frame]->y_buffer + mb_y_offset, - cpi->frames[frame]->u_buffer + mb_uv_offset, - cpi->frames[frame]->v_buffer + mb_uv_offset, - cpi->frames[frame]->y_stride, + frames[frame]->y_buffer + mb_y_offset, + frames[frame]->u_buffer + mb_uv_offset, + frames[frame]->v_buffer + mb_uv_offset, + frames[frame]->y_stride, mb_uv_width, mb_uv_height, mbd->mi[0]->bmi[0].as_mv[0].as_mv.row, mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, @@ -429,6 +433,7 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { int frames_to_blur_backward; int frames_to_blur_forward; struct scale_factors sf; + YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = {NULL}; // Apply context specific adjustments to the arnr filter parameters. 
adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength); @@ -437,16 +442,15 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { start_frame = distance + frames_to_blur_forward; // Setup frame pointers, NULL indicates frame not included in filter. - vp9_zero(cpi->frames); for (frame = 0; frame < frames_to_blur; ++frame) { const int which_buffer = start_frame - frame; struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead, which_buffer); - cpi->frames[frames_to_blur - 1 - frame] = &buf->img; + frames[frames_to_blur - 1 - frame] = &buf->img; } // Setup scaling factors. Scaling on each of the arnr frames is not supported - if (is_spatial_svc(cpi)) { + if (is_two_pass_svc(cpi)) { // In spatial svc the scaling factors might be less then 1/2. So we will use // non-normative scaling. int frame_used = 0; @@ -457,19 +461,21 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { get_frame_new_buffer(cm)->y_crop_height); for (frame = 0; frame < frames_to_blur; ++frame) { - if (cm->mi_cols * MI_SIZE != cpi->frames[frame]->y_width || - cm->mi_rows * MI_SIZE != cpi->frames[frame]->y_height) { + if (cm->mi_cols * MI_SIZE != frames[frame]->y_width || + cm->mi_rows * MI_SIZE != frames[frame]->y_height) { if (vp9_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used], cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to reallocate alt_ref_buffer"); - cpi->frames[frame] = - vp9_scale_if_required(cm, cpi->frames[frame], - &cpi->svc.scaled_frames[frame_used]); + frames[frame] = vp9_scale_if_required(cm, frames[frame], + &cpi->svc.scaled_frames[frame_used]); ++frame_used; } } @@ -480,6 +486,6 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { cm->width, cm->height); } - temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward, - strength, &sf); + 
temporal_filter_iterate_c(cpi, frames, frames_to_blur, + frames_to_blur_backward, strength, &sf); } diff --git a/source/libvpx/vp9/encoder/vp9_variance.c b/source/libvpx/vp9/encoder/vp9_variance.c index eb5ae2e..afbb191 100644 --- a/source/libvpx/vp9/encoder/vp9_variance.c +++ b/source/libvpx/vp9/encoder/vp9_variance.c @@ -103,8 +103,9 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) { unsigned int i, sum = 0; - for (i = 0; i < 256; i++) + for (i = 0; i < 256; ++i) { sum += src_ptr[i] * src_ptr[i]; + } return sum; } diff --git a/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c index b6bcdd9..e799951 100644 --- a/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c +++ b/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c @@ -12,6 +12,8 @@ #include "vp9/common/vp9_idct.h" // for cospi constants #include "vpx_ports/mem.h" +#include "vp9/common/x86/vp9_idct_intrin_sse2.h" + void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) { __m128i in0, in1; __m128i tmp; @@ -780,58 +782,6 @@ static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) { _mm_store_si128((__m128i *)(output + 7 * stride), res[7]); } -// perform in-place transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 
- // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 -} - void fdct8_sse2(__m128i *in) { // constants const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); @@ -1953,23 +1903,6 @@ static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0, write_buffer_8x8(output + 8 * stride, in1 + 8, stride); } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = 
tbuf[6]; - res0[15] = tbuf[7]; -} - static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { // perform rounding operations right_shift_8x8(res0, 2); diff --git a/source/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm b/source/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm deleted file mode 100644 index 32fdd23..0000000 --- a/source/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm +++ /dev/null @@ -1,427 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -global sym(vp9_sad16x16_mmx) PRIVATE -global sym(vp9_sad8x16_mmx) PRIVATE -global sym(vp9_sad8x8_mmx) PRIVATE -global sym(vp9_sad4x4_mmx) PRIVATE -global sym(vp9_sad16x8_mmx) PRIVATE - -;unsigned int vp9_sad16x16_mmx( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -sym(vp9_sad16x16_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rax*8] - - lea rcx, [rcx+rax*8] - pxor mm7, mm7 - - pxor mm6, mm6 - -.x16x16sad_mmx_loop: - - movq mm0, QWORD PTR [rsi] - movq mm2, QWORD PTR [rsi+8] - - movq mm1, QWORD PTR [rdi] - movq mm3, QWORD PTR [rdi+8] - - movq mm4, mm0 - movq mm5, mm2 - - psubusb mm0, mm1 - psubusb mm1, mm4 - - psubusb mm2, mm3 - psubusb mm3, mm5 - - por mm0, mm1 - por mm2, mm3 - - movq mm1, mm0 - movq mm3, mm2 - - punpcklbw mm0, mm6 - punpcklbw mm2, mm6 - - punpckhbw mm1, mm6 - punpckhbw mm3, mm6 - - paddw mm0, mm2 - paddw mm1, mm3 - - - lea rsi, [rsi+rax] - add rdi, rdx - - 
paddw mm7, mm0 - paddw mm7, mm1 - - cmp rsi, rcx - jne .x16x16sad_mmx_loop - - - movq mm0, mm7 - - punpcklwd mm0, mm6 - punpckhwd mm7, mm6 - - paddw mm0, mm7 - movq mm7, mm0 - - - psrlq mm0, 32 - paddw mm7, mm0 - - movq rax, mm7 - - pop rdi - pop rsi - mov rsp, rbp - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_sad8x16_mmx( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -sym(vp9_sad8x16_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rax*8] - - lea rcx, [rcx+rax*8] - pxor mm7, mm7 - - pxor mm6, mm6 - -.x8x16sad_mmx_loop: - - movq mm0, QWORD PTR [rsi] - movq mm1, QWORD PTR [rdi] - - movq mm2, mm0 - psubusb mm0, mm1 - - psubusb mm1, mm2 - por mm0, mm1 - - movq mm2, mm0 - punpcklbw mm0, mm6 - - punpckhbw mm2, mm6 - lea rsi, [rsi+rax] - - add rdi, rdx - paddw mm7, mm0 - - paddw mm7, mm2 - cmp rsi, rcx - - jne .x8x16sad_mmx_loop - - movq mm0, mm7 - punpcklwd mm0, mm6 - - punpckhwd mm7, mm6 - paddw mm0, mm7 - - movq mm7, mm0 - psrlq mm0, 32 - - paddw mm7, mm0 - movq rax, mm7 - - pop rdi - pop rsi - mov rsp, rbp - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_sad8x8_mmx( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -sym(vp9_sad8x8_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rax*8] - pxor mm7, mm7 - - pxor mm6, mm6 - -.x8x8sad_mmx_loop: - - movq mm0, QWORD PTR [rsi] - movq mm1, QWORD PTR [rdi] - - movq mm2, mm0 - psubusb mm0, mm1 - - psubusb mm1, mm2 - por mm0, mm1 - - movq mm2, mm0 - punpcklbw mm0, mm6 - - punpckhbw mm2, 
mm6 - paddw mm0, mm2 - - lea rsi, [rsi+rax] - add rdi, rdx - - paddw mm7, mm0 - cmp rsi, rcx - - jne .x8x8sad_mmx_loop - - movq mm0, mm7 - punpcklwd mm0, mm6 - - punpckhwd mm7, mm6 - paddw mm0, mm7 - - movq mm7, mm0 - psrlq mm0, 32 - - paddw mm7, mm0 - movq rax, mm7 - - pop rdi - pop rsi - mov rsp, rbp - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_sad4x4_mmx( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -sym(vp9_sad4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - movd mm0, DWORD PTR [rsi] - movd mm1, DWORD PTR [rdi] - - movd mm2, DWORD PTR [rsi+rax] - movd mm3, DWORD PTR [rdi+rdx] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - movq mm2, mm0 - psubusb mm0, mm1 - - psubusb mm1, mm2 - por mm0, mm1 - - movq mm2, mm0 - pxor mm3, mm3 - - punpcklbw mm0, mm3 - punpckhbw mm2, mm3 - - paddw mm0, mm2 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movd mm4, DWORD PTR [rsi] - movd mm5, DWORD PTR [rdi] - - movd mm6, DWORD PTR [rsi+rax] - movd mm7, DWORD PTR [rdi+rdx] - - punpcklbw mm4, mm6 - punpcklbw mm5, mm7 - - movq mm6, mm4 - psubusb mm4, mm5 - - psubusb mm5, mm6 - por mm4, mm5 - - movq mm5, mm4 - punpcklbw mm4, mm3 - - punpckhbw mm5, mm3 - paddw mm4, mm5 - - paddw mm0, mm4 - movq mm1, mm0 - - punpcklwd mm0, mm3 - punpckhwd mm1, mm3 - - paddw mm0, mm1 - movq mm1, mm0 - - psrlq mm0, 32 - paddw mm0, mm1 - - movq rax, mm0 - - pop rdi - pop rsi - mov rsp, rbp - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_sad16x8_mmx( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -sym(vp9_sad16x8_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd 
rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rax*8] - pxor mm7, mm7 - - pxor mm6, mm6 - -.x16x8sad_mmx_loop: - - movq mm0, [rsi] - movq mm1, [rdi] - - movq mm2, [rsi+8] - movq mm3, [rdi+8] - - movq mm4, mm0 - movq mm5, mm2 - - psubusb mm0, mm1 - psubusb mm1, mm4 - - psubusb mm2, mm3 - psubusb mm3, mm5 - - por mm0, mm1 - por mm2, mm3 - - movq mm1, mm0 - movq mm3, mm2 - - punpcklbw mm0, mm6 - punpckhbw mm1, mm6 - - punpcklbw mm2, mm6 - punpckhbw mm3, mm6 - - - paddw mm0, mm2 - paddw mm1, mm3 - - paddw mm0, mm1 - lea rsi, [rsi+rax] - - add rdi, rdx - paddw mm7, mm0 - - cmp rsi, rcx - jne .x16x8sad_mmx_loop - - movq mm0, mm7 - punpcklwd mm0, mm6 - - punpckhwd mm7, mm6 - paddw mm0, mm7 - - movq mm7, mm0 - psrlq mm0, 32 - - paddw mm7, mm0 - movq rax, mm7 - - pop rdi - pop rsi - mov rsp, rbp - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret diff --git a/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c b/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c index 7f81f46..ea09b95 100644 --- a/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c +++ b/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c @@ -12,67 +12,39 @@ #include "vp9/encoder/vp9_variance.h" #include "vpx_ports/mem.h" -typedef void (*get_var_avx2) ( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); - -void vp9_get16x16var_avx2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); - -void vp9_get32x32var_avx2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); - -unsigned int vp9_sub_pixel_variance32xh_avx2 -( - const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - int height, - unsigned int *sse -); - -unsigned int 
vp9_sub_pixel_avg_variance32xh_avx2 -( - const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - const uint8_t *sec, - int sec_stride, - int height, - unsigned int *sseptr -); - -static void variance_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - int w, int h, unsigned int *sse, int *sum, - get_var_avx2 var_fn, int block_size) { - unsigned int sse0; - int sum0; +typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +void vp9_get16x16var_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +void vp9_get32x32var_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, + unsigned int *sse); + +unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + const uint8_t *sec, + int sec_stride, + int height, + unsigned int *sseptr); + +static void variance_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + get_var_avx2 var_fn, int block_size) { int i, j; *sse = 0; @@ -80,105 +52,68 @@ static void variance_avx2(const unsigned char *src_ptr, int source_stride, for (i = 0; i < h; i += 16) { for (j = 0; j < w; j += block_size) { - // processing 16 rows horizontally each call - var_fn(src_ptr + source_stride * i + j, source_stride, - ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0); + unsigned int sse0; + int sum0; + var_fn(&src[src_stride * i + j], src_stride, + &ref[ref_stride * i + j], ref_stride, &sse0, &sum0); *sse += 
sse0; *sum += sum0; } } } -unsigned int vp9_variance16x16_avx2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, - &var, &avg, vp9_get16x16var_avx2, 16); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); +unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vp9_get16x16var_avx2, 16); + return *sse - (((unsigned int)sum * sum) >> 8); } -unsigned int vp9_mse16x16_avx2( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int sse0; - int sum0; - vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, - &sum0); - *sse = sse0; - return sse0; +unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse; } -unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - // processing 32 elements vertically in parallel - variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, - &var, &avg, vp9_get32x32var_avx2, 32); - *sse = var; - return (var - (((int64_t)avg * avg) >> 10)); + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 32, 16, + sse, &sum, vp9_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 9); } -unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, - int source_stride, - const 
uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - // processing 32 elements vertically in parallel - variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, - &var, &avg, vp9_get32x32var_avx2, 32); - *sse = var; - return (var - (((int64_t)avg * avg) >> 9)); + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 32, 32, + sse, &sum, vp9_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 10); } - -unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - // processing 32 elements vertically in parallel - variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, - &var, &avg, vp9_get32x32var_avx2, 32); - *sse = var; - return (var - (((int64_t)avg * avg) >> 12)); + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 64, 64, + sse, &sum, vp9_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 12); } -unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - // processing 32 elements vertically in parallel - variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, - &var, &avg, vp9_get32x32var_avx2, 32); - - *sse = var; - return (var - (((int64_t)avg * avg) >> 11)); + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 64, 32, + sse, &sum, vp9_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 11); } unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t 
*src, @@ -187,22 +122,19 @@ unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src, int y_offset, const uint8_t *dst, int dst_stride, - unsigned int *sse_ptr) { - // processing 32 elements in parallel - unsigned int sse; - int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - 64, &sse); - // processing the next 32 elements in parallel + unsigned int *sse) { + unsigned int sse1; + const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, + y_offset, dst, dst_stride, + 64, &sse1); unsigned int sse2; - int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride, - x_offset, y_offset, - dst + 32, dst_stride, - 64, &sse2); - se += se2; - sse += sse2; - *sse_ptr = sse; - return sse - (((int64_t)se * se) >> 12); + const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride, + x_offset, y_offset, + dst + 32, dst_stride, + 64, &sse2); + const int se = se1 + se2; + *sse = sse1 + sse2; + return *sse - (((int64_t)se * se) >> 12); } unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src, @@ -211,14 +143,11 @@ unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src, int y_offset, const uint8_t *dst, int dst_stride, - unsigned int *sse_ptr) { - // processing 32 element in parallel - unsigned int sse; - int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - 32, &sse); - *sse_ptr = sse; - return sse - (((int64_t)se * se) >> 10); + unsigned int *sse) { + const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, + y_offset, dst, dst_stride, + 32, sse); + return *sse - (((int64_t)se * se) >> 10); } unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src, @@ -227,24 +156,22 @@ unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src, int y_offset, const uint8_t *dst, int dst_stride, - unsigned int *sseptr, + unsigned int *sse, const uint8_t *sec) { - // processing 32 elements in parallel - unsigned 
int sse; - - int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - sec, 64, 64, &sse); + unsigned int sse1; + const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, + y_offset, dst, dst_stride, + sec, 64, 64, &sse1); unsigned int sse2; - // processing the next 32 elements in parallel - int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset, - y_offset, dst + 32, dst_stride, - sec + 32, 64, 64, &sse2); - se += se2; - sse += sse2; - *sseptr = sse; + const int se2 = + vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset, + y_offset, dst + 32, dst_stride, + sec + 32, 64, 64, &sse2); + const int se = se1 + se2; - return sse - (((int64_t)se * se) >> 12); + *sse = sse1 + sse2; + + return *sse - (((int64_t)se * se) >> 12); } unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src, @@ -253,15 +180,11 @@ unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src, int y_offset, const uint8_t *dst, int dst_stride, - unsigned int *sseptr, + unsigned int *sse, const uint8_t *sec) { // processing 32 element in parallel - unsigned int sse; - int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - sec, 32, 32, &sse); - *sseptr = sse; - return sse - (((int64_t)se * se) >> 10); + const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, + y_offset, dst, dst_stride, + sec, 32, 32, sse); + return *sse - (((int64_t)se * se) >> 10); } - - diff --git a/source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm deleted file mode 100644 index 3501cf1..0000000 --- a/source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm +++ /dev/null @@ -1,510 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;unsigned int vp9_get_mb_ss_mmx( short *src_ptr ) -global sym(vp9_get_mb_ss_mmx) PRIVATE -sym(vp9_get_mb_ss_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - sub rsp, 8 - ; end prolog - - mov rax, arg(0) ;src_ptr - mov rcx, 16 - pxor mm4, mm4 - -.NEXTROW: - movq mm0, [rax] - movq mm1, [rax+8] - movq mm2, [rax+16] - movq mm3, [rax+24] - pmaddwd mm0, mm0 - pmaddwd mm1, mm1 - pmaddwd mm2, mm2 - pmaddwd mm3, mm3 - - paddd mm4, mm0 - paddd mm4, mm1 - paddd mm4, mm2 - paddd mm4, mm3 - - add rax, 32 - dec rcx - ja .NEXTROW - movq QWORD PTR [rsp], mm4 - - ;return sum[0]+sum[1]; - movsxd rax, dword ptr [rsp] - movsxd rcx, dword ptr [rsp+4] - add rax, rcx - - - ; begin epilog - add rsp, 8 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_get8x8var_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *SSE, -; int *Sum -;) -global sym(vp9_get8x8var_mmx) PRIVATE -sym(vp9_get8x8var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - push rbx - sub rsp, 16 - ; end prolog - - - pxor mm5, mm5 ; Blank mmx6 - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - - ; Row 1 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm1, [rbx] ; Copy eight bytes to mm1 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to 
higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - - ; Row 2 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 3 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 
- paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 4 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 5 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - ; movq mm4, [rbx + rdx] - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 6 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; 
accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 7 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 8 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Now accumulate the final results. 
- movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory - movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory - movsx rdx, WORD PTR [rsp+8] - movsx rcx, WORD PTR [rsp+10] - movsx rbx, WORD PTR [rsp+12] - movsx rax, WORD PTR [rsp+14] - add rdx, rcx - add rbx, rax - add rdx, rbx ;XSum - movsxd rax, DWORD PTR [rsp] - movsxd rcx, DWORD PTR [rsp+4] - add rax, rcx ;XXSum - mov rsi, arg(4) ;SSE - mov rdi, arg(5) ;Sum - mov dword ptr [rsi], eax - mov dword ptr [rdi], edx - xor rax, rax ; return 0 - - - ; begin epilog - add rsp, 16 - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - -;unsigned int -;vp9_get4x4var_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *SSE, -; int *Sum -;) -global sym(vp9_get4x4var_mmx) PRIVATE -sym(vp9_get4x4var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - push rbx - sub rsp, 16 - ; end prolog - - - pxor mm5, mm5 ; Blank mmx6 - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - - ; Row 1 - movd mm0, [rax] ; Copy 4 bytes to mm0 - movd mm1, [rbx] ; Copy 4 bytes to mm1 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy 4 bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - - ; Row 2 - movd mm0, [rax] ; Copy 4 bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - add 
rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy 4 bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 3 - movd mm0, [rax] ; Copy 4 bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy 4 bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 4 - movd mm0, [rax] ; Copy 4 bytes to mm0 - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - paddd mm7, mm0 ; accumulate in mm7 - - - ; Now accumulate the final results. - movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory - movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory - movsx rdx, WORD PTR [rsp+8] - movsx rcx, WORD PTR [rsp+10] - movsx rbx, WORD PTR [rsp+12] - movsx rax, WORD PTR [rsp+14] - add rdx, rcx - add rbx, rax - add rdx, rbx ;XSum - movsxd rax, DWORD PTR [rsp] - movsxd rcx, DWORD PTR [rsp+4] - add rax, rcx ;XXSum - mov rsi, arg(4) ;SSE - mov rdi, arg(5) ;Sum - mov dword ptr [rsi], eax - mov dword ptr [rdi], edx - xor rax, rax ; return 0 - - - ; begin epilog - add rsp, 16 - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - -;unsigned int -;vp9_get4x4sse_cs_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride -;) -global sym(vp9_get4x4sse_cs_mmx) PRIVATE -sym(vp9_get4x4sse_cs_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - push rbx - ; end prolog - - - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses 
- mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - ; Row 1 - movd mm0, [rax] ; Copy eight bytes to mm0 - movd mm1, [rbx] ; Copy eight bytes to mm1 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 2 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 3 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm1, mm6 - punpcklbw mm0, mm6 ; unpack to higher prrcision - psubsw mm0, mm1 ; A-B (low order) to MM0 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 4 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - paddd mm7, mm0 ; accumulate in mm7 - - movq mm0, mm7 ; - psrlq mm7, 32 - - paddd mm0, mm7 - movq rax, mm0 - - - ; begin epilog - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm deleted file mode 100644 index 4830412..0000000 --- a/source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm +++ /dev/null @@ -1,401 +0,0 @@ -; -; 
Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;unsigned int vp9_get_mb_ss_sse2 -;( -; short *src_ptr -;) -global sym(vp9_get_mb_ss_sse2) PRIVATE -sym(vp9_get_mb_ss_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 1 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - - mov rax, arg(0) ;[src_ptr] - mov rcx, 8 - pxor xmm4, xmm4 - -.NEXTROW: - movdqa xmm0, [rax] - movdqa xmm1, [rax+16] - movdqa xmm2, [rax+32] - movdqa xmm3, [rax+48] - pmaddwd xmm0, xmm0 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - pmaddwd xmm3, xmm3 - - paddd xmm0, xmm1 - paddd xmm2, xmm3 - paddd xmm4, xmm0 - paddd xmm4, xmm2 - - add rax, 0x40 - dec rcx - ja .NEXTROW - - movdqa xmm3,xmm4 - psrldq xmm4,8 - paddd xmm4,xmm3 - movdqa xmm3,xmm4 - psrldq xmm4,4 - paddd xmm4,xmm3 - movq rax,xmm4 - - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_get16x16var_sse2 -;( -; unsigned char * src_ptr, -; int source_stride, -; unsigned char * ref_ptr, -; int recon_stride, -; unsigned int * SSE, -; int * Sum -;) -global sym(vp9_get16x16var_sse2) PRIVATE -sym(vp9_get16x16var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;[src_ptr] - mov rdi, arg(2) ;[ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] - - ; Prefetch data - lea rcx, [rax+rax*2] - prefetcht0 [rsi] - prefetcht0 [rsi+rax] - prefetcht0 [rsi+rax*2] - prefetcht0 [rsi+rcx] - lea rbx, [rsi+rax*4] - prefetcht0 [rbx] - prefetcht0 
[rbx+rax] - prefetcht0 [rbx+rax*2] - prefetcht0 [rbx+rcx] - - lea rcx, [rdx+rdx*2] - prefetcht0 [rdi] - prefetcht0 [rdi+rdx] - prefetcht0 [rdi+rdx*2] - prefetcht0 [rdi+rcx] - lea rbx, [rdi+rdx*4] - prefetcht0 [rbx] - prefetcht0 [rbx+rdx] - prefetcht0 [rbx+rdx*2] - prefetcht0 [rbx+rcx] - - pxor xmm0, xmm0 ; clear xmm0 for unpack - pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs - - pxor xmm6, xmm6 ; clear xmm6 for accumulating sse - mov rcx, 16 - -.var16loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rdi] - - prefetcht0 [rsi+rax*8] - prefetcht0 [rdi+rdx*8] - - movdqa xmm3, xmm1 - movdqa xmm4, xmm2 - - - punpcklbw xmm1, xmm0 - punpckhbw xmm3, xmm0 - - punpcklbw xmm2, xmm0 - punpckhbw xmm4, xmm0 - - - psubw xmm1, xmm2 - psubw xmm3, xmm4 - - paddw xmm7, xmm1 - pmaddwd xmm1, xmm1 - - paddw xmm7, xmm3 - pmaddwd xmm3, xmm3 - - paddd xmm6, xmm1 - paddd xmm6, xmm3 - - add rsi, rax - add rdi, rdx - - sub rcx, 1 - jnz .var16loop - - - movdqa xmm1, xmm6 - pxor xmm6, xmm6 - - pxor xmm5, xmm5 - punpcklwd xmm6, xmm7 - - punpckhwd xmm5, xmm7 - psrad xmm5, 16 - - psrad xmm6, 16 - paddd xmm6, xmm5 - - movdqa xmm2, xmm1 - punpckldq xmm1, xmm0 - - punpckhdq xmm2, xmm0 - movdqa xmm7, xmm6 - - paddd xmm1, xmm2 - punpckldq xmm6, xmm0 - - punpckhdq xmm7, xmm0 - paddd xmm6, xmm7 - - movdqa xmm2, xmm1 - movdqa xmm7, xmm6 - - psrldq xmm1, 8 - psrldq xmm6, 8 - - paddd xmm7, xmm6 - paddd xmm1, xmm2 - - mov rax, arg(5) ;[Sum] - mov rdi, arg(4) ;[SSE] - - movd DWORD PTR [rax], xmm7 - movd DWORD PTR [rdi], xmm1 - - - ; begin epilog - pop rdi - pop rsi - pop rbx - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - - -;unsigned int vp9_get8x8var_sse2 -;( -; unsigned char * src_ptr, -; int source_stride, -; unsigned char * ref_ptr, -; int recon_stride, -; unsigned int * SSE, -; int * Sum -;) -global sym(vp9_get8x8var_sse2) PRIVATE -sym(vp9_get8x8var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end 
prolog - - mov rsi, arg(0) ;[src_ptr] - mov rdi, arg(2) ;[ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] - - pxor xmm0, xmm0 ; clear xmm0 for unpack - pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs - - movq xmm1, QWORD PTR [rsi] - movq xmm2, QWORD PTR [rdi] - - punpcklbw xmm1, xmm0 - punpcklbw xmm2, xmm0 - - psubsw xmm1, xmm2 - paddw xmm7, xmm1 - - pmaddwd xmm1, xmm1 - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - movq xmm2, QWORD PTR[rsi + rax * 2] - movq xmm3, QWORD PTR[rdi + rdx * 2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - movq xmm2, QWORD PTR[rsi + rax *2] - movq xmm3, QWORD PTR[rdi + rdx *2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - movq xmm2, QWORD PTR[rsi + rax *2] - movq xmm3, QWORD PTR[rdi + rdx *2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - 
pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - movdqa xmm6, xmm7 - punpcklwd xmm6, xmm0 - - punpckhwd xmm7, xmm0 - movdqa xmm2, xmm1 - - paddw xmm6, xmm7 - punpckldq xmm1, xmm0 - - punpckhdq xmm2, xmm0 - movdqa xmm7, xmm6 - - paddd xmm1, xmm2 - punpckldq xmm6, xmm0 - - punpckhdq xmm7, xmm0 - paddw xmm6, xmm7 - - movdqa xmm2, xmm1 - movdqa xmm7, xmm6 - - psrldq xmm1, 8 - psrldq xmm6, 8 - - paddw xmm7, xmm6 - paddd xmm1, xmm2 - - mov rax, arg(5) ;[Sum] - mov rdi, arg(4) ;[SSE] - - movq rdx, xmm7 - movsx rcx, dx - - mov dword ptr [rax], ecx - movd DWORD PTR [rdi], xmm1 - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - diff --git a/source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c b/source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c deleted file mode 100644 index ce1c832..0000000 --- a/source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "./vpx_config.h" -#include "vp9/encoder/vp9_variance.h" -#include "vpx_ports/mem.h" - -unsigned int vp9_get8x8var_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum); - -unsigned int vp9_get4x4var_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *SSE, int *sum); - -unsigned int vp9_variance4x4_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - vp9_get4x4var_mmx(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((unsigned int)sum * sum) >> 4); -} - -unsigned int vp9_variance8x8_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((unsigned int)sum * sum) >> 6); -} - -unsigned int vp9_mse16x16_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - unsigned int sse0, sse1, sse2, sse3; - int sum0, sum1, sum2, sum3; - - vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0); - vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1); - vp9_get8x8var_mmx(src + 8 * src_stride, src_stride, - ref + 8 * ref_stride, ref_stride, &sse2, &sum2); - vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride, - ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3); - - *sse = sse0 + sse1 + sse2 + sse3; - return *sse; -} - - -unsigned int vp9_variance16x16_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - unsigned int sse0, sse1, sse2, sse3; - int sum0, sum1, sum2, sum3, sum; - - vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0); - vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1); - vp9_get8x8var_mmx(src + 8 * src_stride, src_stride, - ref + 8 * ref_stride, ref_stride, &sse2, &sum2); - 
vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride, - ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3); - - *sse = sse0 + sse1 + sse2 + sse3; - sum = sum0 + sum1 + sum2 + sum3; - return *sse - (((unsigned int)sum * sum) >> 8); -} - -unsigned int vp9_variance16x8_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - unsigned int sse0, sse1; - int sum0, sum1, sum; - - vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0); - vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1); - - *sse = sse0 + sse1; - sum = sum0 + sum1; - return *sse - (((unsigned int)sum * sum) >> 7); -} - - -unsigned int vp9_variance8x16_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - unsigned int sse0, sse1; - int sum0, sum1, sum; - - vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0); - vp9_get8x8var_mmx(src + 8 * src_stride, src_stride, - ref + 8 * ref_stride, ref_stride, &sse1, &sum1); - - *sse = sse0 + sse1; - sum = sum0 + sum1; - return *sse - (((unsigned int)sum * sum) >> 7); -} diff --git a/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c index e935a23..b4d2b0a 100644 --- a/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c +++ b/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <emmintrin.h> // SSE2 + #include "./vpx_config.h" #include "vp9/encoder/vp9_variance.h" @@ -17,18 +19,137 @@ typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride, const unsigned char *ref, int ref_stride, unsigned int *sse, int *sum); -unsigned int vp9_get4x4var_mmx(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse, int *sum); +unsigned int vp9_get_mb_ss_sse2(const int16_t *src) { + __m128i vsum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 32; ++i) { + const __m128i v = _mm_loadu_si128((const __m128i *)src); + vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); + src += 8; + } + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + return _mm_cvtsi128_si32(vsum); +} -unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse, int *sum); +#define READ64(p, stride, i) \ + _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ + _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) + +unsigned int vp9_get4x4var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); + const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); + const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); + const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + // sum + __m128i vsum = _mm_add_epi16(diff0, diff1); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = 
(int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0), + _mm_madd_epi16(diff1, diff1)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + *sse = _mm_cvtsi128_si32(vsum); + + return 0; +} + +unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 8; i += 2) { + const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(src + i * src_stride)), zero); + const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(ref + i * ref_stride)), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(src + (i + 1) * src_stride)), zero); + const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(ref + (i + 1) * ref_stride)), zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); + + return 0; +} + +unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = 
_mm_setzero_si128(); + int i; + + for (i = 0; i < 16; ++i) { + const __m128i s = _mm_loadu_si128((const __m128i *)src); + const __m128i r = _mm_loadu_si128((const __m128i *)ref); + + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + + src += src_stride; + ref += ref_stride; + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0) + + (int16_t)_mm_extract_epi16(vsum, 1); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); + + return 0; +} -unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse, int *sum); static void variance_sse2(const unsigned char *src, int src_stride, const unsigned char *ref, int ref_stride, @@ -55,8 +176,7 @@ unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride, const unsigned char *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 4, 4, - sse, &sum, vp9_get4x4var_mmx, 4); + vp9_get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); return *sse - (((unsigned int)sum * sum) >> 4); } @@ -65,7 +185,7 @@ unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_sse2(src, src_stride, ref, ref_stride, 8, 4, - sse, &sum, vp9_get4x4var_mmx, 4); + sse, &sum, vp9_get4x4var_sse2, 4); 
return *sse - (((unsigned int)sum * sum) >> 5); } @@ -74,7 +194,7 @@ unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_sse2(src, src_stride, ref, ref_stride, 4, 8, - sse, &sum, vp9_get4x4var_mmx, 4); + sse, &sum, vp9_get4x4var_sse2, 4); return *sse - (((unsigned int)sum * sum) >> 5); } @@ -82,8 +202,7 @@ unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride, const unsigned char *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vp9_get8x8var_sse2, 8); + vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); return *sse - (((unsigned int)sum * sum) >> 6); } @@ -109,17 +228,8 @@ unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride, const unsigned char *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((unsigned int)sum * sum) >> 8); -} - -unsigned int vp9_mse16x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse; + return *sse - (((unsigned int)sum * sum) >> 8); } unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride, @@ -176,6 +286,34 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride, return *sse - (((int64_t)sum * sum) >> 11); } +unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int 
vp9_mse16x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + #define DECL(w, opt) \ int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ ptrdiff_t src_stride, \ diff --git a/source/libvpx/vp9/vp9_common.mk b/source/libvpx/vp9/vp9_common.mk index 8e3e885..90f0342 100644 --- a/source/libvpx/vp9/vp9_common.mk +++ b/source/libvpx/vp9/vp9_common.mk @@ -80,7 +80,6 @@ VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c ifeq ($(CONFIG_VP9_POSTPROC),yes) -VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm endif diff --git a/source/libvpx/vp9/vp9_cx_iface.c b/source/libvpx/vp9/vp9_cx_iface.c index 1716053..0f0b7a5 100644 --- a/source/libvpx/vp9/vp9_cx_iface.c +++ b/source/libvpx/vp9/vp9_cx_iface.c @@ -21,7 +21,6 @@ #include "vp9/vp9_iface_common.h" struct vp9_extracfg { - struct vpx_codec_pkt_list *pkt_list; int cpu_used; // available cpu percentage in 1/16 unsigned int enable_auto_alt_ref; unsigned int noise_sensitivity; @@ -31,7 +30,6 @@ struct vp9_extracfg { unsigned int tile_rows; unsigned int arnr_max_frames; unsigned int arnr_strength; - unsigned int arnr_type; vp8e_tuning tuning; unsigned int cq_level; // constrained quality level unsigned int rc_max_intra_bitrate_pct; @@ -39,41 +37,29 @@ struct vp9_extracfg { unsigned int frame_parallel_decoding_mode; AQ_MODE aq_mode; unsigned int frame_periodic_boost; - BIT_DEPTH bit_depth; + vpx_bit_depth_t bit_depth; 
vp9e_tune_content content; }; -struct extraconfig_map { - unsigned int usage; - struct vp9_extracfg cfg; -}; - -static const struct extraconfig_map extracfg_map[] = { - { - 0, - { // NOLINT - NULL, - 0, // cpu_used - 1, // enable_auto_alt_ref - 0, // noise_sensitivity - 0, // sharpness - 0, // static_thresh - 0, // tile_columns - 0, // tile_rows - 7, // arnr_max_frames - 5, // arnr_strength - 3, // arnr_type - VP8_TUNE_PSNR, // tuning - 10, // cq_level - 0, // rc_max_intra_bitrate_pct - 0, // lossless - 0, // frame_parallel_decoding_mode - NO_AQ, // aq_mode - 0, // frame_periodic_delta_q - BITS_8, // Bit depth - VP9E_CONTENT_DEFAULT // content - } - } +static struct vp9_extracfg default_extra_cfg = { + 0, // cpu_used + 1, // enable_auto_alt_ref + 0, // noise_sensitivity + 0, // sharpness + 0, // static_thresh + 0, // tile_columns + 0, // tile_rows + 7, // arnr_max_frames + 5, // arnr_strength + VP8_TUNE_PSNR, // tuning + 10, // cq_level + 0, // rc_max_intra_bitrate_pct + 0, // lossless + 0, // frame_parallel_decoding_mode + NO_AQ, // aq_mode + 0, // frame_periodic_delta_q + VPX_BITS_8, // Bit depth + VP9E_CONTENT_DEFAULT // content }; struct vpx_codec_alg_priv { @@ -177,20 +163,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, } RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS); - -#if CONFIG_SPATIAL_SVC - if (cfg->ss_number_layers > 1) { - unsigned int i, alt_ref_sum = 0; - for (i = 0; i < cfg->ss_number_layers; ++i) { - if (cfg->ss_enable_auto_alt_ref[i]) - ++alt_ref_sum; - } - if (alt_ref_sum > REF_FRAMES - cfg->ss_number_layers) - ERROR("Not enough ref buffers for svc alt ref frames"); - } -#endif - RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS); + if (cfg->ts_number_layers > 1) { unsigned int i; for (i = 1; i < cfg->ts_number_layers; ++i) @@ -203,6 +177,28 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, ERROR("ts_rate_decimator factors are not powers of 2"); } +#if CONFIG_SPATIAL_SVC + if 
(cfg->ss_number_layers * cfg->ts_number_layers > REF_FRAMES) + ERROR("Too many layers. Maximum 8 layers could be set"); + + if ((cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) && + cfg->g_pass == VPX_RC_LAST_PASS) { + unsigned int i, alt_ref_sum = 0; + for (i = 0; i < cfg->ss_number_layers; ++i) { + if (cfg->ss_enable_auto_alt_ref[i]) + ++alt_ref_sum; + } + if (alt_ref_sum > + REF_FRAMES - cfg->ss_number_layers * cfg->ts_number_layers) + ERROR("Not enough ref buffers for svc alt ref frames"); + if ((cfg->ss_number_layers > 3 || + cfg->ss_number_layers * cfg->ts_number_layers > 4) && + cfg->g_error_resilient == 0) + ERROR("Multiple frame context are not supported for more than 3 spatial " + "layers or more than 4 spatial x temporal layers"); + } +#endif + // VP9 does not support a lower bound on the keyframe interval in // automatic keyframe placement mode. if (cfg->kf_mode != VPX_KF_DISABLED && @@ -219,8 +215,9 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(extra_cfg, sharpness, 7); RANGE_CHECK(extra_cfg, arnr_max_frames, 0, 15); RANGE_CHECK_HI(extra_cfg, arnr_strength, 6); - RANGE_CHECK(extra_cfg, arnr_type, 1, 3); RANGE_CHECK(extra_cfg, cq_level, 0, 63); + RANGE_CHECK(cfg, g_bit_depth, VPX_BITS_8, VPX_BITS_12); + RANGE_CHECK(cfg, g_input_bit_depth, 8, 12); RANGE_CHECK(extra_cfg, content, VP9E_CONTENT_DEFAULT, VP9E_CONTENT_INVALID - 1); @@ -239,7 +236,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, if (cfg->rc_twopass_stats_in.sz % packet_sz) ERROR("rc_twopass_stats_in.sz indicates truncated packet."); - if (cfg->ss_number_layers > 1) { + if (cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) { int i; unsigned int n_packets_per_layer[VPX_SS_MAX_LAYERS] = {0}; @@ -279,12 +276,16 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, } } +#if !CONFIG_VP9_HIGHBITDEPTH + if (cfg->g_profile > (unsigned int)PROFILE_1) + ERROR("Profile > 1 not supported in this build configuration"); 
+#endif if (cfg->g_profile <= (unsigned int)PROFILE_1 && - extra_cfg->bit_depth > BITS_8) - ERROR("High bit-depth not supported in profile < 2"); + extra_cfg->bit_depth > VPX_BITS_8) + ERROR("Codec high bit-depth not supported in profile < 2"); if (cfg->g_profile > (unsigned int)PROFILE_1 && - extra_cfg->bit_depth == BITS_8) - ERROR("Bit-depth 8 not supported in profile > 1"); + extra_cfg->bit_depth == VPX_BITS_8) + ERROR("Codec bit-depth 8 not supported in profile > 1"); return VPX_CODEC_OK; } @@ -316,6 +317,9 @@ static int get_image_bps(const vpx_image_t *img) { case VPX_IMG_FMT_I420: return 12; case VPX_IMG_FMT_I422: return 16; case VPX_IMG_FMT_I444: return 24; + case VPX_IMG_FMT_I42016: return 24; + case VPX_IMG_FMT_I42216: return 32; + case VPX_IMG_FMT_I44416: return 48; default: assert(0 && "Invalid image format"); break; } return 0; @@ -330,12 +334,13 @@ static vpx_codec_err_t set_encoder_config( oxcf->width = cfg->g_w; oxcf->height = cfg->g_h; oxcf->bit_depth = extra_cfg->bit_depth; + oxcf->input_bit_depth = cfg->g_input_bit_depth; // guess a frame rate if out of whack, use 30 oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num; if (oxcf->init_framerate > 180) oxcf->init_framerate = 30; - oxcf->mode = BEST; + oxcf->mode = GOOD; switch (cfg->g_pass) { case VPX_RC_ONE_PASS: @@ -393,7 +398,6 @@ static vpx_codec_err_t set_encoder_config( oxcf->sharpness = extra_cfg->sharpness; oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in; - oxcf->output_pkt_list = extra_cfg->pkt_list; #if CONFIG_FP_MB_STATS oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in; @@ -401,7 +405,6 @@ static vpx_codec_err_t set_encoder_config( oxcf->arnr_max_frames = extra_cfg->arnr_max_frames; oxcf->arnr_strength = extra_cfg->arnr_strength; - oxcf->arnr_type = extra_cfg->arnr_type; oxcf->tuning = extra_cfg->tuning; oxcf->content = extra_cfg->content; @@ -428,6 +431,9 @@ static vpx_codec_err_t set_encoder_config( } } else if (oxcf->ss_number_layers == 1) { 
oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth; +#if CONFIG_SPATIAL_SVC + oxcf->ss_play_alternate[0] = extra_cfg->enable_auto_alt_ref; +#endif } oxcf->ts_number_layers = cfg->ts_number_layers; @@ -597,9 +603,9 @@ static vpx_codec_err_t ctrl_set_arnr_strength(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_set_arnr_type(vpx_codec_alg_priv_t *ctx, va_list args) { - struct vp9_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.arnr_type = CAST(VP8E_SET_ARNR_TYPE, args); - return update_extra_cfg(ctx, &extra_cfg); + (void)ctx; + (void)args; + return VPX_CODEC_OK; } static vpx_codec_err_t ctrl_set_tuning(vpx_codec_alg_priv_t *ctx, @@ -659,51 +665,32 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, (void)data; if (ctx->priv == NULL) { - int i; - vpx_codec_enc_cfg_t *cfg; - struct vpx_codec_alg_priv *priv = calloc(1, sizeof(*priv)); - + vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv)); if (priv == NULL) return VPX_CODEC_MEM_ERROR; - ctx->priv = &priv->base; - ctx->priv->sz = sizeof(*ctx->priv); - ctx->priv->alg_priv = priv; + ctx->priv = (vpx_codec_priv_t *)priv; ctx->priv->init_flags = ctx->init_flags; ctx->priv->enc.total_encoders = 1; if (ctx->config.enc) { // Update the reference to the config structure to an internal copy. - ctx->priv->alg_priv->cfg = *ctx->config.enc; - ctx->config.enc = &ctx->priv->alg_priv->cfg; + priv->cfg = *ctx->config.enc; + ctx->config.enc = &priv->cfg; } - cfg = &ctx->priv->alg_priv->cfg; - - // Select the extra vp6 configuration table based on the current - // usage value. If the current usage value isn't found, use the - // values for usage case 0. 
- for (i = 0; - extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage; - ++i) {} - - priv->extra_cfg = extracfg_map[i].cfg; - priv->extra_cfg.pkt_list = &priv->pkt_list.head; - + priv->extra_cfg = default_extra_cfg; vp9_initialize_enc(); res = validate_config(priv, &priv->cfg, &priv->extra_cfg); if (res == VPX_CODEC_OK) { - VP9_COMP *cpi; - set_encoder_config(&ctx->priv->alg_priv->oxcf, - &ctx->priv->alg_priv->cfg, - &ctx->priv->alg_priv->extra_cfg); - cpi = vp9_create_compressor(&ctx->priv->alg_priv->oxcf); - if (cpi == NULL) + set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg); + priv->cpi = vp9_create_compressor(&priv->oxcf); + if (priv->cpi == NULL) res = VPX_CODEC_MEM_ERROR; else - ctx->priv->alg_priv->cpi = cpi; + priv->cpi->output_pkt_list = &priv->pkt_list.head; } } @@ -713,7 +700,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) { free(ctx->cx_data); vp9_remove_compressor(ctx->cpi); - free(ctx); + vpx_free(ctx); return VPX_CODEC_OK; } @@ -825,6 +812,23 @@ static int64_t ticks_to_timebase_units(const vpx_rational_t *timebase, return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC; } +static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, + unsigned int lib_flags) { + vpx_codec_frame_flags_t flags = lib_flags << 16; + + if (lib_flags & FRAMEFLAGS_KEY +#if CONFIG_SPATIAL_SVC + || (is_two_pass_svc(cpi) && cpi->svc.layer_context[0].is_key_frame) +#endif + ) + flags |= VPX_FRAME_IS_KEY; + + if (cpi->droppable) + flags |= VPX_FRAME_IS_DROPPABLE; + + return flags; +} + static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, @@ -832,18 +836,19 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, vpx_enc_frame_flags_t flags, unsigned long deadline) { vpx_codec_err_t res = VPX_CODEC_OK; + VP9_COMP *const cpi = ctx->cpi; const vpx_rational_t *const timebase = 
&ctx->cfg.g_timebase; if (img != NULL) { res = validate_img(ctx, img); // TODO(jzern) the checks related to cpi's validity should be treated as a // failure condition, encoder setup is done fully in init() currently. - if (res == VPX_CODEC_OK && ctx->cpi != NULL && ctx->cx_data == NULL) { + if (res == VPX_CODEC_OK && cpi != NULL && ctx->cx_data == NULL) { // There's no codec control for multiple alt-refs so check the encoder // instance for its status to determine the compressed data size. ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 * - (ctx->cpi->multi_arf_allowed ? 8 : 2); + (cpi->multi_arf_allowed ? 8 : 2); if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096; ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz); @@ -863,7 +868,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_INVALID_PARAM; } - vp9_apply_encoding_flags(ctx->cpi, flags); + vp9_apply_encoding_flags(cpi, flags); // Handle fixed keyframe intervals if (ctx->cfg.kf_mode == VPX_KF_AUTO && @@ -875,7 +880,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } // Initialize the encoder instance on the first frame. - if (res == VPX_CODEC_OK && ctx->cpi != NULL) { + if (res == VPX_CODEC_OK && cpi != NULL) { unsigned int lib_flags = 0; YV12_BUFFER_CONFIG sd; int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts); @@ -886,16 +891,15 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // Set up internal flags if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) - ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1; + cpi->b_calculate_psnr = 1; if (img != NULL) { res = image2yuvconfig(img, &sd); // Store the original flags in to the frame buffer. Will extract the // key frame flag when we actually encode this frame. 
- if (vp9_receive_raw_frame(ctx->cpi, flags, + if (vp9_receive_raw_frame(cpi, flags, &sd, dst_time_stamp, dst_end_time_stamp)) { - VP9_COMP *cpi = (VP9_COMP *)ctx->cpi; res = update_error_state(ctx, &cpi->common.error); } } @@ -920,22 +924,21 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } while (cx_data_sz >= ctx->cx_data_sz / 2 && - -1 != vp9_get_compressed_data(ctx->cpi, &lib_flags, &size, + -1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data, &dst_time_stamp, &dst_end_time_stamp, !img)) { if (size) { - VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi; vpx_codec_cx_pkt_t pkt; #if CONFIG_SPATIAL_SVC - if (is_spatial_svc(cpi)) + if (is_two_pass_svc(cpi)) cpi->svc.layer_context[cpi->svc.spatial_layer_id].layer_size += size; #endif // Pack invisible frames with the next visible frame - if (cpi->common.show_frame == 0 + if (!cpi->common.show_frame #if CONFIG_SPATIAL_SVC - || (is_spatial_svc(cpi) && + || (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) #endif ) { @@ -955,30 +958,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(timebase, dst_end_time_stamp - dst_time_stamp); - pkt.data.frame.flags = lib_flags << 16; - - if (lib_flags & FRAMEFLAGS_KEY -#if CONFIG_SPATIAL_SVC - || (is_spatial_svc(cpi) && - cpi->svc.layer_context[0].is_key_frame) -#endif - ) - pkt.data.frame.flags |= VPX_FRAME_IS_KEY; - - if (cpi->common.show_frame == 0) { - pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE; - - // This timestamp should be as close as possible to the - // prior PTS so that if a decoder uses pts to schedule when - // to do this, we start right after last frame was decoded. - // Invisible frames have no duration. 
- pkt.data.frame.pts = - ticks_to_timebase_units(timebase, cpi->last_time_stamp_seen) + 1; - pkt.data.frame.duration = 0; - } - - if (cpi->droppable) - pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE; + pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); if (ctx->pending_cx_data) { ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; @@ -1000,9 +980,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, cx_data += size; cx_data_sz -= size; #if CONFIG_SPATIAL_SVC - if (is_spatial_svc(cpi)) { - vpx_codec_cx_pkt_t pkt = {0}; + if (is_two_pass_svc(cpi)) { + vpx_codec_cx_pkt_t pkt; int i; + vp9_zero(pkt); pkt.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES; for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { pkt.data.layer_sizes[i] = cpi->svc.layer_context[i].layer_size; @@ -1289,6 +1270,9 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 320, // g_width 240, // g_height + VPX_BITS_8, // g_bit_depth + 8, // g_input_bit_depth + {1, 30}, // g_timebase 0, // g_error_resilient @@ -1354,11 +1338,11 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = { encoder_destroy, // vpx_codec_destroy_fn_t encoder_ctrl_maps, // vpx_codec_ctrl_fn_map_t { // NOLINT - NOT_IMPLEMENTED, // vpx_codec_peek_si_fn_t - NOT_IMPLEMENTED, // vpx_codec_get_si_fn_t - NOT_IMPLEMENTED, // vpx_codec_decode_fn_t - NOT_IMPLEMENTED, // vpx_codec_frame_get_fn_t - NOT_IMPLEMENTED // vpx_codec_set_fb_fn_t + NULL, // vpx_codec_peek_si_fn_t + NULL, // vpx_codec_get_si_fn_t + NULL, // vpx_codec_decode_fn_t + NULL, // vpx_codec_frame_get_fn_t + NULL // vpx_codec_set_fb_fn_t }, { // NOLINT 1, // 1 cfg map @@ -1366,8 +1350,8 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = { encoder_encode, // vpx_codec_encode_fn_t encoder_get_cxdata, // vpx_codec_get_cx_data_fn_t encoder_set_config, // vpx_codec_enc_config_set_fn_t - NOT_IMPLEMENTED, // vpx_codec_get_global_headers_fn_t + NULL, // vpx_codec_get_global_headers_fn_t encoder_get_preview, // vpx_codec_get_preview_frame_fn_t - NOT_IMPLEMENTED // 
vpx_codec_enc_mr_get_mem_loc_fn_t + NULL // vpx_codec_enc_mr_get_mem_loc_fn_t } }; diff --git a/source/libvpx/vp9/vp9_dx_iface.c b/source/libvpx/vp9/vp9_dx_iface.c index bb2bb10..393c66e 100644 --- a/source/libvpx/vp9/vp9_dx_iface.c +++ b/source/libvpx/vp9/vp9_dx_iface.c @@ -58,28 +58,22 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, (void)data; if (!ctx->priv) { - vpx_codec_alg_priv_t *alg_priv = vpx_memalign(32, sizeof(*alg_priv)); - if (alg_priv == NULL) + vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv)); + if (priv == NULL) return VPX_CODEC_MEM_ERROR; - vp9_zero(*alg_priv); - - ctx->priv = (vpx_codec_priv_t *)alg_priv; - ctx->priv->sz = sizeof(*ctx->priv); - ctx->priv->alg_priv = alg_priv; - ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si); + ctx->priv = (vpx_codec_priv_t *)priv; ctx->priv->init_flags = ctx->init_flags; - ctx->priv->alg_priv->flushed = 0; - ctx->priv->alg_priv->frame_parallel_decode = - (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING); - // Disable frame parallel decoding for now. - ctx->priv->alg_priv->frame_parallel_decode = 0; + priv->si.sz = sizeof(priv->si); + priv->flushed = 0; + priv->frame_parallel_decode = + (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING); + priv->frame_parallel_decode = 0; // Disable for now if (ctx->config.dec) { - // Update the reference to the config structure to an internal copy. - ctx->priv->alg_priv->cfg = *ctx->config.dec; - ctx->config.dec = &ctx->priv->alg_priv->cfg; + priv->cfg = *ctx->config.dec; + ctx->config.dec = &priv->cfg; } } @@ -443,6 +437,7 @@ static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx, // call to get_frame. 
if (!(*iter)) { img = &ctx->img; + img->bit_depth = (int)ctx->pbi->common.bit_depth; *iter = img; } } @@ -591,6 +586,23 @@ static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx, } } +static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx, + va_list args) { + unsigned int *const bit_depth = va_arg(args, unsigned int *); + + if (bit_depth) { + if (ctx->pbi) { + const VP9_COMMON *const cm = &ctx->pbi->common; + *bit_depth = cm->bit_depth; + return VPX_CODEC_OK; + } else { + return VPX_CODEC_ERROR; + } + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + static vpx_codec_err_t ctrl_set_invert_tile_order(vpx_codec_alg_priv_t *ctx, va_list args) { ctx->invert_tile_order = va_arg(args, int); @@ -623,6 +635,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { {VP8D_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted}, {VP9_GET_REFERENCE, ctrl_get_reference}, {VP9D_GET_DISPLAY_SIZE, ctrl_get_display_size}, + {VP9D_GET_BIT_DEPTH, ctrl_get_bit_depth}, { -1, NULL}, }; @@ -647,12 +660,12 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = { }, { // NOLINT 0, - NOT_IMPLEMENTED, // vpx_codec_enc_cfg_map_t - NOT_IMPLEMENTED, // vpx_codec_encode_fn_t - NOT_IMPLEMENTED, // vpx_codec_get_cx_data_fn_t - NOT_IMPLEMENTED, // vpx_codec_enc_config_set_fn_t - NOT_IMPLEMENTED, // vpx_codec_get_global_headers_fn_t - NOT_IMPLEMENTED, // vpx_codec_get_preview_frame_fn_t - NOT_IMPLEMENTED // vpx_codec_enc_mr_get_mem_loc_fn_t + NULL, // vpx_codec_enc_cfg_map_t + NULL, // vpx_codec_encode_fn_t + NULL, // vpx_codec_get_cx_data_fn_t + NULL, // vpx_codec_enc_config_set_fn_t + NULL, // vpx_codec_get_global_headers_fn_t + NULL, // vpx_codec_get_preview_frame_fn_t + NULL // vpx_codec_enc_mr_get_mem_loc_fn_t } }; diff --git a/source/libvpx/vp9/vp9cx.mk b/source/libvpx/vp9/vp9cx.mk index dc46c4e..e450f7b 100644 --- a/source/libvpx/vp9/vp9cx.mk +++ b/source/libvpx/vp9/vp9cx.mk @@ -93,10 +93,6 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c 
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h -VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c -VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm -VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c diff --git a/source/libvpx/vpx/internal/vpx_codec_internal.h b/source/libvpx/vpx/internal/vpx_codec_internal.h index 95119df..cbfffd0 100644 --- a/source/libvpx/vpx/internal/vpx_codec_internal.h +++ b/source/libvpx/vpx/internal/vpx_codec_internal.h @@ -286,8 +286,6 @@ typedef const struct vpx_codec_enc_cfg_map { vpx_codec_enc_cfg_t cfg; } vpx_codec_enc_cfg_map_t; -#define NOT_IMPLEMENTED 0 - /*!\brief Decoder algorithm interface interface * * All decoders \ref MUST expose a variable of this type. @@ -337,8 +335,6 @@ typedef struct vpx_codec_priv_cb_pair { * and the pointer cast to the proper type. 
*/ struct vpx_codec_priv { - unsigned int sz; - struct vpx_codec_alg_priv *alg_priv; const char *err_detail; vpx_codec_flags_t init_flags; struct { @@ -346,7 +342,7 @@ struct vpx_codec_priv { vpx_codec_priv_cb_pair_t put_slice_cb; } dec; struct { - struct vpx_fixed_buf cx_data_dst_buf; + vpx_fixed_buf_t cx_data_dst_buf; unsigned int cx_data_pad_before; unsigned int cx_data_pad_after; vpx_codec_cx_pkt_t cx_data_pkt; diff --git a/source/libvpx/vpx/src/svc_encodeframe.c b/source/libvpx/vpx/src/svc_encodeframe.c index 7828615..8911e83 100644 --- a/source/libvpx/vpx/src/svc_encodeframe.c +++ b/source/libvpx/vpx/src/svc_encodeframe.c @@ -86,6 +86,7 @@ typedef struct SvcInternal { int layers; int layer; int is_keyframe; + int use_multiple_frame_contexts; FrameData *frame_list; FrameData *frame_temp; @@ -366,6 +367,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { char *option_name; char *option_value; char *input_ptr; + SvcInternal *const si = get_svc_internal(svc_ctx); vpx_codec_err_t res = VPX_CODEC_OK; if (options == NULL) return VPX_CODEC_OK; @@ -382,8 +384,10 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { res = VPX_CODEC_INVALID_PARAM; break; } - if (strcmp("layers", option_name) == 0) { + if (strcmp("spatial-layers", option_name) == 0) { svc_ctx->spatial_layers = atoi(option_value); + } else if (strcmp("temporal-layers", option_name) == 0) { + svc_ctx->temporal_layers = atoi(option_value); } else if (strcmp("scale-factors", option_name) == 0) { res = parse_scale_factors(svc_ctx, option_value); if (res != VPX_CODEC_OK) break; @@ -393,6 +397,8 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { } else if (strcmp("auto-alt-refs", option_name) == 0) { res = parse_auto_alt_ref(svc_ctx, option_value); if (res != VPX_CODEC_OK) break; + } else if (strcmp("multi-frame-contexts", option_name) == 0) { + si->use_multiple_frame_contexts = atoi(option_value); } else { 
svc_log(svc_ctx, SVC_LOG_ERROR, "invalid option: %s\n", option_name); res = VPX_CODEC_INVALID_PARAM; @@ -401,6 +407,12 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { option_name = strtok_r(NULL, "=", &input_ptr); } free(input_string); + + if (si->use_multiple_frame_contexts && + (svc_ctx->spatial_layers > 3 || + svc_ctx->spatial_layers * svc_ctx->temporal_layers > 4)) + res = VPX_CODEC_INVALID_PARAM; + return res; } @@ -480,6 +492,16 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, res = parse_options(svc_ctx, si->options); if (res != VPX_CODEC_OK) return res; + if (svc_ctx->spatial_layers < 1) + svc_ctx->spatial_layers = 1; + if (svc_ctx->spatial_layers > VPX_SS_MAX_LAYERS) + svc_ctx->spatial_layers = VPX_SS_MAX_LAYERS; + + if (svc_ctx->temporal_layers < 1) + svc_ctx->temporal_layers = 1; + if (svc_ctx->temporal_layers > VPX_TS_MAX_LAYERS) + svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS; + si->layers = svc_ctx->spatial_layers; // Assign target bitrate for each layer. We calculate the ratio @@ -515,9 +537,18 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, enc_cfg->ss_enable_auto_alt_ref[i] = si->enable_auto_alt_ref[i]; #endif + if (svc_ctx->temporal_layers > 1) { + int i; + for (i = 0; i < svc_ctx->temporal_layers; ++i) { + enc_cfg->ts_target_bitrate[i] = enc_cfg->rc_target_bitrate / + svc_ctx->temporal_layers; + enc_cfg->ts_rate_decimator[i] = 1 << (svc_ctx->temporal_layers - 1 - i); + } + } + // modify encoder configuration enc_cfg->ss_number_layers = si->layers; - enc_cfg->ts_number_layers = 1; // Temporal layers not used in this encoder. 
+ enc_cfg->ts_number_layers = svc_ctx->temporal_layers; // TODO(ivanmaltz): determine if these values need to be set explicitly for // svc, or if the normal default/override mechanism can be used @@ -534,7 +565,8 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, enc_cfg->rc_buf_initial_sz = 500; enc_cfg->rc_buf_optimal_sz = 600; enc_cfg->rc_buf_sz = 1000; - enc_cfg->g_error_resilient = 1; + if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0) + enc_cfg->g_error_resilient = 1; // Initialize codec res = vpx_codec_enc_init(codec_ctx, iface, enc_cfg, VPX_CODEC_USE_PSNR); diff --git a/source/libvpx/vpx/src/vpx_codec.c b/source/libvpx/vpx/src/vpx_codec.c index d175eae..5a495ce 100644 --- a/source/libvpx/vpx/src/vpx_codec.c +++ b/source/libvpx/vpx/src/vpx_codec.c @@ -88,8 +88,7 @@ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) { else if (!ctx->iface || !ctx->priv) res = VPX_CODEC_ERROR; else { - if (ctx->priv->alg_priv) - ctx->iface->destroy(ctx->priv->alg_priv); + ctx->iface->destroy((vpx_codec_alg_priv_t *)ctx->priv); ctx->iface = NULL; ctx->name = NULL; @@ -125,7 +124,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, va_list ap; va_start(ap, ctrl_id); - res = entry->fn(ctx->priv->alg_priv, ap); + res = entry->fn((vpx_codec_alg_priv_t *)ctx->priv, ap); va_end(ap); break; } diff --git a/source/libvpx/vpx/src/vpx_decoder.c b/source/libvpx/vpx/src/vpx_decoder.c index b19c440..802d8ed 100644 --- a/source/libvpx/vpx/src/vpx_decoder.c +++ b/source/libvpx/vpx/src/vpx_decoder.c @@ -18,9 +18,13 @@ #define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var) +static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) { + return (vpx_codec_alg_priv_t *)ctx->priv; +} + vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, - vpx_codec_dec_cfg_t *cfg, + const vpx_codec_dec_cfg_t *cfg, vpx_codec_flags_t flags, int ver) { vpx_codec_err_t res; @@ -94,7 +98,7 @@ vpx_codec_err_t 
vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, si->w = 0; si->h = 0; - res = ctx->iface->dec.get_si(ctx->priv->alg_priv, si); + res = ctx->iface->dec.get_si(get_alg_priv(ctx), si); } return SAVE_STATUS(ctx, res); @@ -115,8 +119,8 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, else if (!ctx->iface || !ctx->priv) res = VPX_CODEC_ERROR; else { - res = ctx->iface->dec.decode(ctx->priv->alg_priv, data, data_sz, - user_priv, deadline); + res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv, + deadline); } return SAVE_STATUS(ctx, res); @@ -129,7 +133,7 @@ vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, if (!ctx || !iter || !ctx->iface || !ctx->priv) img = NULL; else - img = ctx->iface->dec.get_frame(ctx->priv->alg_priv, iter); + img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter); return img; } @@ -185,7 +189,7 @@ vpx_codec_err_t vpx_codec_set_frame_buffer_functions( !(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) { res = VPX_CODEC_ERROR; } else { - res = ctx->iface->dec.set_fb_fn(ctx->priv->alg_priv, cb_get, cb_release, + res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release, cb_priv); } diff --git a/source/libvpx/vpx/src/vpx_encoder.c b/source/libvpx/vpx/src/vpx_encoder.c index 5773455..1903b55 100644 --- a/source/libvpx/vpx/src/vpx_encoder.c +++ b/source/libvpx/vpx/src/vpx_encoder.c @@ -20,9 +20,13 @@ #define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var) +static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) { + return (vpx_codec_alg_priv_t *)ctx->priv; +} + vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, - vpx_codec_enc_cfg_t *cfg, + const vpx_codec_enc_cfg_t *cfg, vpx_codec_flags_t flags, int ver) { vpx_codec_err_t res; @@ -216,7 +220,7 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, FLOATING_POINT_INIT(); if (num_enc == 1) - res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts, + res = ctx->iface->enc.encode(get_alg_priv(ctx), 
img, pts, duration, flags, deadline); else { /* Multi-resolution encoding: @@ -230,7 +234,7 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, if (img) img += num_enc - 1; for (i = num_enc - 1; i >= 0; i--) { - if ((res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts, + if ((res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags, deadline))) break; @@ -259,7 +263,7 @@ const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER)) ctx->err = VPX_CODEC_INCAPABLE; else - pkt = ctx->iface->enc.get_cx_data(ctx->priv->alg_priv, iter); + pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter); } if (pkt && pkt->kind == VPX_CODEC_CX_FRAME_PKT) { @@ -327,7 +331,7 @@ const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx) { else if (!ctx->iface->enc.get_preview) ctx->err = VPX_CODEC_INCAPABLE; else - img = ctx->iface->enc.get_preview(ctx->priv->alg_priv); + img = ctx->iface->enc.get_preview(get_alg_priv(ctx)); } return img; @@ -345,7 +349,7 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx) { else if (!ctx->iface->enc.get_glob_hdrs) ctx->err = VPX_CODEC_INCAPABLE; else - buf = ctx->iface->enc.get_glob_hdrs(ctx->priv->alg_priv); + buf = ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx)); } return buf; @@ -361,7 +365,7 @@ vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; else - res = ctx->iface->enc.cfg_set(ctx->priv->alg_priv, cfg); + res = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg); return SAVE_STATUS(ctx, res); } diff --git a/source/libvpx/vpx/src/vpx_image.c b/source/libvpx/vpx/src/vpx_image.c index e20703a..e58b61e 100644 --- a/source/libvpx/vpx/src/vpx_image.c +++ b/source/libvpx/vpx/src/vpx_image.c @@ -154,7 +154,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, goto fail; img->fmt = fmt; - img->bit_depth = (fmt & VPX_IMG_FMT_HIGH) ? 
16 : 8; + img->bit_depth = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 16 : 8; img->w = w; img->h = h; img->x_chroma_shift = xcs; diff --git a/source/libvpx/vpx/svc_context.h b/source/libvpx/vpx/svc_context.h index e0de263..eea3b13 100644 --- a/source/libvpx/vpx/svc_context.h +++ b/source/libvpx/vpx/svc_context.h @@ -31,7 +31,8 @@ typedef enum SVC_LOG_LEVEL { typedef struct { // public interface to svc_command options - int spatial_layers; // number of layers + int spatial_layers; // number of spatial layers + int temporal_layers; // number of temporal layers SVC_LOG_LEVEL log_level; // amount of information to display int log_print; // when set, printf log messages instead of returning the // message with svc_get_message diff --git a/source/libvpx/vpx/vp8dx.h b/source/libvpx/vpx/vp8dx.h index bd7f19c..379b306 100644 --- a/source/libvpx/vpx/vp8dx.h +++ b/source/libvpx/vpx/vp8dx.h @@ -75,6 +75,9 @@ enum vp8_dec_control_id { /** control function to get the display dimensions for the current frame. */ VP9D_GET_DISPLAY_SIZE, + /** control function to get the bit depth of the stream. */ + VP9D_GET_BIT_DEPTH, + /** For testing. */ VP9_INVERT_TILE_DECODE_ORDER, @@ -118,6 +121,7 @@ VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *) VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *) VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *) VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *) +VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *) VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) /*! 
@} - end defgroup vp8_decoder */ diff --git a/source/libvpx/vpx/vpx_codec.h b/source/libvpx/vpx/vpx_codec.h index 07df72a..b25308e 100644 --- a/source/libvpx/vpx/vpx_codec.h +++ b/source/libvpx/vpx/vpx_codec.h @@ -203,9 +203,11 @@ extern "C" { const char *err_detail; /**< Detailed info, if available */ vpx_codec_flags_t init_flags; /**< Flags passed at init time */ union { - struct vpx_codec_dec_cfg *dec; /**< Decoder Configuration Pointer */ - struct vpx_codec_enc_cfg *enc; /**< Encoder Configuration Pointer */ - void *raw; + /**< Decoder Configuration Pointer */ + const struct vpx_codec_dec_cfg *dec; + /**< Encoder Configuration Pointer */ + const struct vpx_codec_enc_cfg *enc; + const void *raw; } config; /**< Configuration pointer aliasing union */ vpx_codec_priv_t *priv; /**< Algorithm private storage */ } vpx_codec_ctx_t; @@ -215,9 +217,9 @@ extern "C" { * This enumeration determines the bit depth of the codec. */ typedef enum vpx_bit_depth { - VPX_BITS_8, /**< 8 bits */ - VPX_BITS_10, /**< 10 bits */ - VPX_BITS_12 /**< 12 bits */ + VPX_BITS_8 = 8, /**< 8 bits */ + VPX_BITS_10 = 10, /**< 10 bits */ + VPX_BITS_12 = 12, /**< 12 bits */ } vpx_bit_depth_t; /* diff --git a/source/libvpx/vpx/vpx_decoder.h b/source/libvpx/vpx/vpx_decoder.h index 10b89fa..62fd919 100644 --- a/source/libvpx/vpx/vpx_decoder.h +++ b/source/libvpx/vpx/vpx_decoder.h @@ -135,7 +135,7 @@ extern "C" { */ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, - vpx_codec_dec_cfg_t *cfg, + const vpx_codec_dec_cfg_t *cfg, vpx_codec_flags_t flags, int ver); diff --git a/source/libvpx/vpx/vpx_encoder.h b/source/libvpx/vpx/vpx_encoder.h index 7dbbf2f..fdabed1 100644 --- a/source/libvpx/vpx/vpx_encoder.h +++ b/source/libvpx/vpx/vpx_encoder.h @@ -80,6 +80,9 @@ extern "C" { */ #define VPX_CODEC_CAP_OUTPUT_PARTITION 0x20000 +/*! Can support input images at greater than 8 bitdepth. + */ +#define VPX_CODEC_CAP_HIGHBITDEPTH 0x40000 /*! 
\brief Initialization-time Feature Enabling * @@ -91,6 +94,7 @@ extern "C" { #define VPX_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */ #define VPX_CODEC_USE_OUTPUT_PARTITION 0x20000 /**< Make the encoder output one partition at a time. */ +#define VPX_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */ /*!\brief Generic fixed size buffer structure @@ -188,14 +192,14 @@ extern "C" { has id 0.*/ } frame; /**< data for compressed frame packet */ - struct vpx_fixed_buf twopass_stats; /**< data for two-pass packet */ - struct vpx_fixed_buf firstpass_mb_stats; /**< first pass mb packet */ + vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */ + vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ struct vpx_psnr_pkt { unsigned int samples[4]; /**< Number of samples, total/y/u/v */ uint64_t sse[4]; /**< sum squared error, total/y/u/v */ double psnr[4]; /**< PSNR, total/y/u/v */ } psnr; /**< data for PSNR packet */ - struct vpx_fixed_buf raw; /**< data for arbitrary packets */ + vpx_fixed_buf_t raw; /**< data for arbitrary packets */ #if CONFIG_SPATIAL_SVC size_t layer_sizes[VPX_SS_MAX_LAYERS]; #endif @@ -324,6 +328,21 @@ extern "C" { */ unsigned int g_h; + /*!\brief Bit-depth of the codec + * + * This value identifies the bit_depth of the codec, + * Only certain bit-depths are supported as identified in the + * vpx_bit_depth_t enum. + */ + vpx_bit_depth_t g_bit_depth; + + /*!\brief Bit-depth of the input frames + * + * This value identifies the bit_depth of the input frames in bits. + * Note that the frames passed as input to the encoder must have + * this bit-depth. + */ + unsigned int g_input_bit_depth; /*!\brief Stream timebase units * @@ -452,14 +471,14 @@ extern "C" { * A buffer containing all of the stats packets produced in the first * pass, concatenated. */ - struct vpx_fixed_buf rc_twopass_stats_in; + vpx_fixed_buf_t rc_twopass_stats_in; /*!\brief first pass mb stats buffer. 
* * A buffer containing all of the first pass mb stats packets produced * in the first pass, concatenated. */ - struct vpx_fixed_buf rc_firstpass_mb_stats_in; + vpx_fixed_buf_t rc_firstpass_mb_stats_in; /*!\brief Target data rate * @@ -715,7 +734,7 @@ extern "C" { */ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, - vpx_codec_enc_cfg_t *cfg, + const vpx_codec_enc_cfg_t *cfg, vpx_codec_flags_t flags, int ver); diff --git a/source/libvpx/vpx/vpx_image.h b/source/libvpx/vpx/vpx_image.h index 7b04b70..0b7bb90 100644 --- a/source/libvpx/vpx/vpx_image.h +++ b/source/libvpx/vpx/vpx_image.h @@ -31,10 +31,10 @@ extern "C" { #define VPX_IMAGE_ABI_VERSION (2) /**<\hideinitializer*/ -#define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format */ -#define VPX_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U plane in memory */ -#define VPX_IMG_FMT_HAS_ALPHA 0x400 /**< Image has an alpha channel component */ -#define VPX_IMG_FMT_HIGH 0x800 /**< Image uses 16bit framebuffer */ +#define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */ +#define VPX_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */ +#define VPX_IMG_FMT_HAS_ALPHA 0x400 /**< Image has an alpha channel. */ +#define VPX_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. 
*/ /*!\brief List of supported image formats */ typedef enum vpx_img_fmt { @@ -59,9 +59,9 @@ extern "C" { VPX_IMG_FMT_I422 = VPX_IMG_FMT_PLANAR | 5, VPX_IMG_FMT_I444 = VPX_IMG_FMT_PLANAR | 6, VPX_IMG_FMT_444A = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 7, - VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGH, - VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGH, - VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGH + VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */ #if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT diff --git a/source/libvpx/vpx_mem/vpx_mem.c b/source/libvpx/vpx_mem/vpx_mem.c index 059248b..da61642 100644 --- a/source/libvpx/vpx_mem/vpx_mem.c +++ b/source/libvpx/vpx_mem/vpx_mem.c @@ -16,6 +16,7 @@ #include <stdlib.h> #include <string.h> #include "include/vpx_mem_intrnl.h" +#include "vpx/vpx_integer.h" #if CONFIG_MEM_TRACKER #ifndef VPX_NO_GLOBALS @@ -452,6 +453,29 @@ void *vpx_memset(void *dest, int val, size_t length) { return VPX_MEMSET_L(dest, val, length); } +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH +void *vpx_memset16(void *dest, int val, size_t length) { +#if CONFIG_MEM_CHECKS + if ((int)dest < 0x4000) { + _P(printf("WARNING: vpx_memset dest:0x%x val:%d len:%d\n", + (int)dest, val, length);) + +#if defined(VXWORKS) + sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0); + + vx_sleep(10000); +#endif + } +#endif + int i; + void *orig = dest; + uint16_t *dest16 = dest; + for (i = 0; i < length; i++) + *dest16++ = val; + return orig; +} +#endif // CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + void *vpx_memmove(void *dest, const void *src, size_t count) { #if CONFIG_MEM_CHECKS diff --git a/source/libvpx/vpx_mem/vpx_mem.h b/source/libvpx/vpx_mem/vpx_mem.h index 33686b2..e2391f4 100644 --- 
a/source/libvpx/vpx_mem/vpx_mem.h +++ b/source/libvpx/vpx_mem/vpx_mem.h @@ -73,6 +73,9 @@ extern "C" { void *vpx_memcpy(void *dest, const void *src, size_t length); void *vpx_memset(void *dest, int val, size_t length); +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + void *vpx_memset16(void *dest, int val, size_t length); +#endif void *vpx_memmove(void *dest, const void *src, size_t count); /* special memory functions */ diff --git a/source/libvpx/vpx_scale/generic/yv12config.c b/source/libvpx/vpx_scale/generic/yv12config.c index 827bce7..70d7ac0 100644 --- a/source/libvpx/vpx_scale/generic/yv12config.c +++ b/source/libvpx/vpx_scale/generic/yv12config.c @@ -13,6 +13,9 @@ #include "./vpx_config.h" #include "vpx_scale/yv12config.h" #include "vpx_mem/vpx_mem.h" +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH +#include "vp9/common/vp9_common.h" +#endif /**************************************************************************** * Exports @@ -136,7 +139,11 @@ int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) { int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, - int ss_x, int ss_y, int border, + int ss_x, int ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border, vpx_codec_frame_buffer_t *fb, vpx_get_frame_buffer_cb_fn_t cb, void *cb_priv) { @@ -161,11 +168,21 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, const int alpha_border_h = border; const uint64_t alpha_plane_size = (alpha_height + 2 * alpha_border_h) * (uint64_t)alpha_stride; +#if CONFIG_VP9_HIGHBITDEPTH + const uint64_t frame_size = (1 + use_highbitdepth) * + (yplane_size + 2 * uvplane_size + alpha_plane_size); +#else const uint64_t frame_size = yplane_size + 2 * uvplane_size + alpha_plane_size; +#endif // CONFIG_VP9_HIGHBITDEPTH +#else +#if CONFIG_VP9_HIGHBITDEPTH + const uint64_t frame_size = + (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size); #else const uint64_t frame_size = yplane_size + 2 * uvplane_size; -#endif +#endif // 
CONFIG_VP9_HIGHBITDEPTH +#endif // CONFIG_ALPHA if (cb != NULL) { const int align_addr_extra_size = 31; const uint64_t external_frame_size = frame_size + align_addr_extra_size; @@ -231,11 +248,31 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, ybf->border = border; ybf->frame_size = (int)frame_size; +#if CONFIG_VP9_HIGHBITDEPTH + if (use_highbitdepth) { + // Store uint16 addresses when using 16bit framebuffers + uint8_t *p = CONVERT_TO_BYTEPTR(ybf->buffer_alloc); + ybf->y_buffer = p + (border * y_stride) + border; + ybf->u_buffer = p + yplane_size + + (uv_border_h * uv_stride) + uv_border_w; + ybf->v_buffer = p + yplane_size + uvplane_size + + (uv_border_h * uv_stride) + uv_border_w; + ybf->flags = YV12_FLAG_HIGHBITDEPTH; + } else { + ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border; + ybf->u_buffer = ybf->buffer_alloc + yplane_size + + (uv_border_h * uv_stride) + uv_border_w; + ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + + (uv_border_h * uv_stride) + uv_border_w; + ybf->flags = 0; + } +#else ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border; ybf->u_buffer = ybf->buffer_alloc + yplane_size + (uv_border_h * uv_stride) + uv_border_w; ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (uv_border_h * uv_stride) + uv_border_w; +#endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_ALPHA ybf->alpha_width = alpha_width; @@ -252,11 +289,18 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, - int ss_x, int ss_y, int border) { + int ss_x, int ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border) { if (ybf) { vp9_free_frame_buffer(ybf); - return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, border, - NULL, NULL, NULL); + return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border, NULL, NULL, NULL); } return -2; } diff --git 
a/source/libvpx/vpx_scale/generic/yv12extend.c b/source/libvpx/vpx_scale/generic/yv12extend.c index 036a505..0485452 100644 --- a/source/libvpx/vpx_scale/generic/yv12extend.c +++ b/source/libvpx/vpx_scale/generic/yv12extend.c @@ -13,6 +13,9 @@ #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" #include "vpx_scale/yv12config.h" +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH +#include "vp9/common/vp9_common.h" +#endif static void extend_plane(uint8_t *const src, int src_stride, int width, int height, @@ -55,6 +58,50 @@ static void extend_plane(uint8_t *const src, int src_stride, } } +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH +static void extend_plane_high(uint8_t *const src8, int src_stride, + int width, int height, + int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i; + const int linesize = extend_left + extend_right + width; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + + /* copy the left and right most columns out */ + uint16_t *src_ptr1 = src; + uint16_t *src_ptr2 = src + width - 1; + uint16_t *dst_ptr1 = src - extend_left; + uint16_t *dst_ptr2 = src + width; + + for (i = 0; i < height; ++i) { + vpx_memset16(dst_ptr1, src_ptr1[0], extend_left); + vpx_memset16(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_stride; + src_ptr2 += src_stride; + dst_ptr1 += src_stride; + dst_ptr2 += src_stride; + } + + /* Now copy the top and bottom lines into each line of the respective + * borders + */ + src_ptr1 = src - extend_left; + src_ptr2 = src + src_stride * (height - 1) - extend_left; + dst_ptr1 = src + src_stride * -extend_top - extend_left; + dst_ptr2 = src + src_stride * height - extend_left; + + for (i = 0; i < extend_top; ++i) { + vpx_memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t)); + dst_ptr1 += src_stride; + } + + for (i = 0; i < extend_bottom; ++i) { + vpx_memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t)); + dst_ptr2 += src_stride; + } +} +#endif + void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) 
{ const int uv_border = ybf->border / 2; @@ -64,6 +111,31 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { assert(ybf->y_height - ybf->y_crop_height >= 0); assert(ybf->y_width - ybf->y_crop_width >= 0); +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { + extend_plane_high( + ybf->y_buffer, ybf->y_stride, + ybf->y_crop_width, ybf->y_crop_height, + ybf->border, ybf->border, + ybf->border + ybf->y_height - ybf->y_crop_height, + ybf->border + ybf->y_width - ybf->y_crop_width); + + extend_plane_high( + ybf->u_buffer, ybf->uv_stride, + (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2, + ybf->border / 2, ybf->border / 2, + (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2, + (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2); + + extend_plane_high( + ybf->v_buffer, ybf->uv_stride, + (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2, + ybf->border / 2, ybf->border / 2, + (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2, + (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2); + return; + } +#endif extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height, ybf->border, ybf->border, @@ -99,6 +171,20 @@ static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) { assert(ybf->y_height - ybf->y_crop_height >= 0); assert(ybf->y_width - ybf->y_crop_width >= 0); +#if CONFIG_VP9_HIGHBITDEPTH + if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { + extend_plane_high(ybf->y_buffer, ybf->y_stride, + ybf->y_crop_width, ybf->y_crop_height, + ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); + extend_plane_high(ybf->u_buffer, ybf->uv_stride, + c_w, c_h, c_et, c_el, c_eb, c_er); + extend_plane_high(ybf->v_buffer, ybf->uv_stride, + c_w, c_h, c_et, c_el, c_eb, c_er); + return; + } +#endif extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height, ext_size, ext_size, @@ -121,6 
+207,14 @@ void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) { VP9INNERBORDERINPIXELS : ybf->border; extend_frame(ybf, inner_bw); } + +#if CONFIG_VP9_HIGHBITDEPTH +void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + vpx_memcpy(dst, src, num * sizeof(uint16_t)); +} +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_VP9 // Copies the source image into the destination image and updates the @@ -140,6 +234,40 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, assert(src_ybc->y_height == dst_ybc->y_height); #endif +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { + assert(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH); + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy_short_addr(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } + + src = src_ybc->u_buffer; + dst = dst_ybc->u_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy_short_addr(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + src = src_ybc->v_buffer; + dst = dst_ybc->v_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy_short_addr(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + vp8_yv12_extend_frame_borders_c(dst_ybc); + return; + } else { + assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH)); + } +#endif + for (row = 0; row < src_ybc->y_height; ++row) { vpx_memcpy(dst, src, src_ybc->y_width); src += src_ybc->y_stride; @@ -173,6 +301,19 @@ void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, const uint8_t *src = src_ybc->y_buffer; uint8_t *dst = dst_ybc->y_buffer; +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (row 
= 0; row < src_ybc->y_height; ++row) { + vpx_memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t)); + src16 += src_ybc->y_stride; + dst16 += dst_ybc->y_stride; + } + return; + } +#endif + for (row = 0; row < src_ybc->y_height; ++row) { vpx_memcpy(dst, src, src_ybc->y_width); src += src_ybc->y_stride; diff --git a/source/libvpx/vpx_scale/yv12config.h b/source/libvpx/vpx_scale/yv12config.h index cdde75c..eb0a8d6 100644 --- a/source/libvpx/vpx_scale/yv12config.h +++ b/source/libvpx/vpx_scale/yv12config.h @@ -55,6 +55,8 @@ typedef struct yv12_buffer_config { int flags; } YV12_BUFFER_CONFIG; +#define YV12_FLAG_HIGHBITDEPTH 1 + int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border); int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, @@ -63,6 +65,9 @@ int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf); int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif int border); // Updates the yv12 buffer config with the frame buffer. If cb is not @@ -73,6 +78,9 @@ int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, // on failure. 
int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif int border, vpx_codec_frame_buffer_t *fb, vpx_get_frame_buffer_cb_fn_t cb, diff --git a/source/libvpx/vpxdec.c b/source/libvpx/vpxdec.c index 6c822ab..6470081 100644 --- a/source/libvpx/vpxdec.c +++ b/source/libvpx/vpxdec.c @@ -90,12 +90,20 @@ static const arg_def_t fb_arg = static const arg_def_t md5arg = ARG_DEF(NULL, "md5", 0, "Compute the MD5 sum of the decoded frame"); +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH +static const arg_def_t outbitdeptharg = ARG_DEF( + NULL, "output-bit-depth", 1, + "Output bit-depth for decoded frames"); +#endif static const arg_def_t *all_args[] = { &codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, &noblitarg, &progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile, &threadsarg, &verbosearg, &scalearg, &fb_arg, &md5arg, &error_concealment, &continuearg, +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + &outbitdeptharg, +#endif NULL }; @@ -129,6 +137,26 @@ static const arg_def_t *vp8_pp_args[] = { #if CONFIG_LIBYUV static INLINE int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst, FilterModeEnum mode) { +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + if (src->fmt == VPX_IMG_FMT_I42016) { + assert(dst->fmt == VPX_IMG_FMT_I42016); + return I420Scale_16((uint16_t*)src->planes[VPX_PLANE_Y], + src->stride[VPX_PLANE_Y]/2, + (uint16_t*)src->planes[VPX_PLANE_U], + src->stride[VPX_PLANE_U]/2, + (uint16_t*)src->planes[VPX_PLANE_V], + src->stride[VPX_PLANE_V]/2, + src->d_w, src->d_h, + (uint16_t*)dst->planes[VPX_PLANE_Y], + dst->stride[VPX_PLANE_Y]/2, + (uint16_t*)dst->planes[VPX_PLANE_U], + dst->stride[VPX_PLANE_U]/2, + (uint16_t*)dst->planes[VPX_PLANE_V], + dst->stride[VPX_PLANE_V]/2, + dst->d_w, dst->d_h, + mode); + } +#endif assert(src->fmt == VPX_IMG_FMT_I420); assert(dst->fmt == VPX_IMG_FMT_I420); return I420Scale(src->planes[VPX_PLANE_Y], 
src->stride[VPX_PLANE_Y], @@ -265,6 +293,11 @@ static void update_image_md5(const vpx_image_t *img, const int planes[3], static void write_image_file(const vpx_image_t *img, const int planes[3], FILE *file) { int i, y; +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + const int bytes_per_sample = ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); +#else + const int bytes_per_sample = 1; +#endif for (i = 0; i < 3; ++i) { const int plane = planes[i]; @@ -274,7 +307,7 @@ static void write_image_file(const vpx_image_t *img, const int planes[3], const int h = vpx_img_plane_height(img, plane); for (y = 0; y < h; ++y) { - fwrite(buf, 1, w, file); + fwrite(buf, bytes_per_sample, w, file); buf += stride; } } @@ -494,6 +527,178 @@ static FILE *open_outfile(const char *name) { } } +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH +static void high_img_upshift(vpx_image_t *dst, vpx_image_t *src, + int input_shift) { + const int offset = input_shift > 0 ? (1 << (input_shift - 1)) : 0; + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || + dst->fmt != src->fmt || input_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_I42216: + case VPX_IMG_FMT_I44416: + break; + default: + fatal("Unsupported image conversion"); + break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w >>= src->x_chroma_shift; + h >>= src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + uint16_t *p_src = (uint16_t *)(src->planes[plane] + + y * src->stride[plane]); + uint16_t *p_dst = (uint16_t *)(dst->planes[plane] + + y * dst->stride[plane]); + for (x = 0; x < w; x++) + *p_dst++ = (*p_src++ << input_shift) + offset; + } + } +} + +static void low_img_upshift(vpx_image_t *dst, vpx_image_t *src, + int input_shift) { + const int offset = input_shift > 0 ? 
(1 << (input_shift - 1)) : 0; + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || + dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH || + input_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_I422: + case VPX_IMG_FMT_I444: + break; + default: + fatal("Unsupported image conversion"); + break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w >>= src->x_chroma_shift; + h >>= src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + uint8_t *p_src = src->planes[plane] + y * src->stride[plane]; + uint16_t *p_dst = (uint16_t *)(dst->planes[plane] + + y * dst->stride[plane]); + for (x = 0; x < w; x++) { + *p_dst++ = (*p_src++ << input_shift) + offset; + } + } + } +} + +static void img_upshift(vpx_image_t *dst, vpx_image_t *src, + int input_shift) { + if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + high_img_upshift(dst, src, input_shift); + } else { + low_img_upshift(dst, src, input_shift); + } +} + +static void high_img_downshift(vpx_image_t *dst, vpx_image_t *src, + int down_shift) { + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || + dst->fmt != src->fmt || down_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_I42216: + case VPX_IMG_FMT_I44416: + break; + default: + fatal("Unsupported image conversion"); + break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w >>= src->x_chroma_shift; + h >>= src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + uint16_t *p_src = (uint16_t *)(src->planes[plane] + + y * src->stride[plane]); + uint16_t *p_dst = (uint16_t *)(dst->planes[plane] + 
+ y * dst->stride[plane]); + for (x = 0; x < w; x++) + *p_dst++ = *p_src++ >> down_shift; + } + } +} + +static void low_img_downshift(vpx_image_t *dst, vpx_image_t *src, + int down_shift) { + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || + src->fmt != dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH || + down_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (dst->fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_I422: + case VPX_IMG_FMT_I444: + break; + default: + fatal("Unsupported image conversion"); + break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w >>= src->x_chroma_shift; + h >>= src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + uint16_t *p_src = (uint16_t *)(src->planes[plane] + + y * src->stride[plane]); + uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane]; + for (x = 0; x < w; x++) { + *p_dst++ = *p_src++ >> down_shift; + } + } + } +} + +static void img_downshift(vpx_image_t *dst, vpx_image_t *src, + int down_shift) { + if (dst->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + high_img_downshift(dst, src, down_shift); + } else { + low_img_downshift(dst, src, down_shift); + } +} +#endif + int main_loop(int argc, const char **argv_) { vpx_codec_ctx_t decoder; char *fn = NULL; @@ -518,6 +723,9 @@ int main_loop(int argc, const char **argv_) { int opt_yv12 = 0; int opt_i420 = 0; vpx_codec_dec_cfg_t cfg = {0, 0, 0}; +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + int output_bit_depth = 0; +#endif #if CONFIG_VP8_DECODER vp8_postproc_cfg_t vp8_pp_cfg = {0}; int vp8_dbg_color_ref_frame = 0; @@ -529,6 +737,9 @@ int main_loop(int argc, const char **argv_) { int dec_flags = 0; int do_scale = 0; vpx_image_t *scaled_img = NULL; +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + vpx_image_t *img_shifted = NULL; +#endif int frame_avail, got_data; int num_external_frame_buffers = 0; struct 
ExternalFrameBufferList ext_fb_list = {0, NULL}; @@ -569,6 +780,9 @@ int main_loop(int argc, const char **argv_) { use_y4m = 0; flipuv = 1; opt_yv12 = 1; +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + output_bit_depth = 8; // For yv12 8-bit depth output is assumed +#endif } else if (arg_match(&arg, &use_i420, argi)) { use_y4m = 0; flipuv = 0; @@ -599,7 +813,13 @@ int main_loop(int argc, const char **argv_) { do_scale = 1; else if (arg_match(&arg, &fb_arg, argi)) num_external_frame_buffers = arg_parse_uint(&arg); - + else if (arg_match(&arg, &continuearg, argi)) + keep_going = 1; +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + else if (arg_match(&arg, &outbitdeptharg, argi)) { + output_bit_depth = arg_parse_uint(&arg); + } +#endif #if CONFIG_VP8_DECODER else if (arg_match(&arg, &addnoise_level, argi)) { postproc = 1; @@ -649,11 +869,8 @@ int main_loop(int argc, const char **argv_) { } } else if (arg_match(&arg, &error_concealment, argi)) { ec_enabled = 1; - } else if (arg_match(&arg, &continuearg, argi)) { - keep_going = 1; } - -#endif +#endif // CONFIG_VP8_DECODER else argj++; } @@ -889,7 +1106,7 @@ int main_loop(int argc, const char **argv_) { display_height = display_size[1]; } } - scaled_img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, display_width, + scaled_img = vpx_img_alloc(NULL, img->fmt, display_width, display_height, 16); scaled_img->bit_depth = img->bit_depth; } @@ -907,6 +1124,33 @@ int main_loop(int argc, const char **argv_) { #endif } } +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + // Default to codec bit depth if output bit depth not set + if (!output_bit_depth) { + output_bit_depth = img->bit_depth; + } + // Shift up or down if necessary + if (output_bit_depth != img->bit_depth) { + if (!img_shifted) { + if (output_bit_depth == 8) { + img_shifted = vpx_img_alloc( + NULL, img->fmt - VPX_IMG_FMT_HIGHBITDEPTH, + img->d_w, img->d_h, 16); + } else { + img_shifted = vpx_img_alloc( + NULL, img->fmt | VPX_IMG_FMT_HIGHBITDEPTH, + img->d_w, img->d_h, 16); + } + 
img_shifted->bit_depth = output_bit_depth; + } + if (output_bit_depth > img->bit_depth) { + img_upshift(img_shifted, img, output_bit_depth - img->bit_depth); + } else { + img_downshift(img_shifted, img, img->bit_depth - output_bit_depth); + } + img = img_shifted; + } +#endif if (single_file) { if (use_y4m) { @@ -1013,6 +1257,9 @@ fail: free(buf); if (scaled_img) vpx_img_free(scaled_img); +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + if (img_shifted) vpx_img_free(img_shifted); +#endif for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) { free(ext_fb_list.ext_fb[i].data); diff --git a/source/libvpx/vpxenc.c b/source/libvpx/vpxenc.c index b99e61a..5afca24 100644 --- a/source/libvpx/vpxenc.c +++ b/source/libvpx/vpxenc.c @@ -200,6 +200,10 @@ static const arg_def_t experimental_bitstream = ARG_DEF(NULL, "experimental-bitstream", 0, "Allow experimental bitstream features."); +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH +static const arg_def_t test16bitinternalarg = ARG_DEF( + NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer"); +#endif static const arg_def_t *main_args[] = { &debugmode, @@ -248,6 +252,9 @@ static const arg_def_t *global_args[] = { #endif &timebase, &framerate, &error_resilient, +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + &test16bitinternalarg, +#endif &lag_in_frames, NULL }; @@ -321,7 +328,7 @@ static const arg_def_t *kf_args[] = { static const arg_def_t noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)"); static const arg_def_t sharpness = ARG_DEF(NULL, "sharpness", 1, - "Filter sharpness (0-7)"); + "Loop filter sharpness (0..7)"); static const arg_def_t static_thresh = ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"); static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1, @@ -329,11 +336,11 @@ static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1, static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"); 
static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1, - "AltRef Max Frames"); + "AltRef max frames (0..15)"); static const arg_def_t arnr_strength = ARG_DEF(NULL, "arnr-strength", 1, - "AltRef Strength"); + "AltRef filter strength (0..6)"); static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1, - "AltRef Type"); + "AltRef type"); static const struct arg_enum_list tuning_enum[] = { {"psnr", VP8_TUNE_PSNR}, {"ssim", VP8_TUNE_SSIM}, @@ -378,9 +385,26 @@ static const arg_def_t aq_mode = ARG_DEF( "Adaptive quantization mode (0: off (default), 1: variance 2: complexity, " "3: cyclic refresh)"); static const arg_def_t frame_periodic_boost = ARG_DEF( - NULL, "frame_boost", 1, + NULL, "frame-boost", 1, "Enable frame periodic boost (0: off (default), 1: on)"); +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH +static const struct arg_enum_list bitdepth_enum[] = { + {"8", VPX_BITS_8}, + {"10", VPX_BITS_10}, + {"12", VPX_BITS_12}, + {NULL, 0} +}; + +static const arg_def_t bitdeptharg = ARG_DEF_ENUM("b", "bit-depth", 1, + "Bit depth for codec " + "(8 for version <=1, " + "10 or 12 for version 2)", + bitdepth_enum); +static const arg_def_t inbitdeptharg = ARG_DEF(NULL, "input-bit-depth", 1, + "Bit depth of input"); +#endif + static const struct arg_enum_list tune_content_enum[] = { {"default", VP9E_CONTENT_DEFAULT}, {"screen", VP9E_CONTENT_SCREEN}, @@ -395,6 +419,9 @@ static const arg_def_t *vp9_args[] = { &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type, &tune_ssim, &cq_level, &max_intra_rate_pct, &lossless, &frame_parallel_decoding, &aq_mode, &frame_periodic_boost, &tune_content, +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + &bitdeptharg, &inbitdeptharg, +#endif NULL }; static const int vp9_arg_ctrl_map[] = { @@ -450,6 +477,102 @@ void usage_exit() { } #define mmin(a, b) ((a) < (b) ? 
(a) : (b)) + +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH +static void find_mismatch_high(const vpx_image_t *const img1, + const vpx_image_t *const img2, + int yloc[4], int uloc[4], int vloc[4]) { + uint16_t *plane1, *plane2; + uint32_t stride1, stride2; + const uint32_t bsize = 64; + const uint32_t bsizey = bsize >> img1->y_chroma_shift; + const uint32_t bsizex = bsize >> img1->x_chroma_shift; + const uint32_t c_w = + (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + int match = 1; + uint32_t i, j; + yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; + plane1 = (uint16_t*)img1->planes[VPX_PLANE_Y]; + plane2 = (uint16_t*)img2->planes[VPX_PLANE_Y]; + stride1 = img1->stride[VPX_PLANE_Y]/2; + stride2 = img2->stride[VPX_PLANE_Y]/2; + for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { + for (j = 0; match && j < img1->d_w; j += bsize) { + int k, l; + const int si = mmin(i + bsize, img1->d_h) - i; + const int sj = mmin(j + bsize, img1->d_w) - j; + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(plane1 + (i + k) * stride1 + j + l) != + *(plane2 + (i + k) * stride2 + j + l)) { + yloc[0] = i + k; + yloc[1] = j + l; + yloc[2] = *(plane1 + (i + k) * stride1 + j + l); + yloc[3] = *(plane2 + (i + k) * stride2 + j + l); + match = 0; + break; + } + } + } + } + } + + uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; + plane1 = (uint16_t*)img1->planes[VPX_PLANE_U]; + plane2 = (uint16_t*)img2->planes[VPX_PLANE_U]; + stride1 = img1->stride[VPX_PLANE_U]/2; + stride2 = img2->stride[VPX_PLANE_U]/2; + for (i = 0, match = 1; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + int k, l; + const int si = mmin(i + bsizey, c_h - i); + const int sj = mmin(j + bsizex, c_w - j); + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(plane1 + (i + k) * stride1 + j + l) != + *(plane2 + (i + k) * stride2 + j + l)) { + 
uloc[0] = i + k; + uloc[1] = j + l; + uloc[2] = *(plane1 + (i + k) * stride1 + j + l); + uloc[3] = *(plane2 + (i + k) * stride2 + j + l); + match = 0; + break; + } + } + } + } + } + + vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; + plane1 = (uint16_t*)img1->planes[VPX_PLANE_V]; + plane2 = (uint16_t*)img2->planes[VPX_PLANE_V]; + stride1 = img1->stride[VPX_PLANE_V]/2; + stride2 = img2->stride[VPX_PLANE_V]/2; + for (i = 0, match = 1; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + int k, l; + const int si = mmin(i + bsizey, c_h - i); + const int sj = mmin(j + bsizex, c_w - j); + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(plane1 + (i + k) * stride1 + j + l) != + *(plane2 + (i + k) * stride2 + j + l)) { + vloc[0] = i + k; + vloc[1] = j + l; + vloc[2] = *(plane1 + (i + k) * stride1 + j + l); + vloc[3] = *(plane2 + (i + k) * stride2 + j + l); + match = 0; + break; + } + } + } + } + } +} +#endif + static void find_mismatch(const vpx_image_t *const img1, const vpx_image_t *const img2, int yloc[4], int uloc[4], int vloc[4]) { @@ -542,7 +665,8 @@ static void find_mismatch(const vpx_image_t *const img1, static int compare_img(const vpx_image_t *const img1, const vpx_image_t *const img2) { - const uint32_t c_w = + uint32_t l_w = img1->d_w; + uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; const uint32_t c_h = (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; @@ -552,11 +676,17 @@ static int compare_img(const vpx_image_t *const img1, match &= (img1->fmt == img2->fmt); match &= (img1->d_w == img2->d_w); match &= (img1->d_h == img2->d_h); +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + if (img1->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + l_w *= 2; + c_w *= 2; + } +#endif for (i = 0; i < img1->d_h; ++i) match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], - img1->d_w) == 0); + l_w) == 0); for (i = 0; i < 
c_h; ++i) match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], @@ -601,6 +731,10 @@ struct stream_config { int arg_ctrl_cnt; int write_webm; int have_kf_max_dist; +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + // whether to use 16bit internal buffers + int use_16bit_internal; +#endif }; @@ -740,8 +874,9 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { #if CONFIG_VP9_ENCODER // Make default VP9 passes = 2 until there is a better quality 1-pass // encoder - global->passes = (strcmp(global->codec->name, "vp9") == 0 && - global->deadline != VPX_DL_REALTIME) ? 2 : 1; + if (global->codec != NULL && global->codec->name != NULL) + global->passes = (strcmp(global->codec->name, "vp9") == 0 && + global->deadline != VPX_DL_REALTIME) ? 2 : 1; #else global->passes = 1; #endif @@ -809,8 +944,10 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global, struct stream_state *stream; stream = calloc(1, sizeof(*stream)); - if (!stream) + if (stream == NULL) { fatal("Failed to allocate new stream."); + } + if (prev) { memcpy(stream, prev, sizeof(*stream)); stream->index++; @@ -870,6 +1007,9 @@ static int parse_stream_params(struct VpxEncoderConfig *global, static const int *ctrl_args_map = NULL; struct stream_config *config = &stream->config; int eos_mark_found = 0; +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + int test_16bit_internal = 0; +#endif // Handle codec specific options if (0) { @@ -918,6 +1058,12 @@ static int parse_stream_params(struct VpxEncoderConfig *global, config->cfg.g_w = arg_parse_uint(&arg); } else if (arg_match(&arg, &height, argi)) { config->cfg.g_h = arg_parse_uint(&arg); +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + } else if (arg_match(&arg, &bitdeptharg, argi)) { + config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg); + } else if (arg_match(&arg, &inbitdeptharg, argi)) { + config->cfg.g_input_bit_depth = arg_parse_uint(&arg); +#endif #if CONFIG_WEBM_IO } else if (arg_match(&arg, 
&stereo_mode, argi)) { config->stereo_fmt = arg_parse_enum_or_int(&arg); @@ -985,6 +1131,12 @@ static int parse_stream_params(struct VpxEncoderConfig *global, config->have_kf_max_dist = 1; } else if (arg_match(&arg, &kf_disabled, argi)) { config->cfg.kf_mode = VPX_KF_DISABLED; +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + } else if (arg_match(&arg, &test16bitinternalarg, argi)) { + if (strcmp(global->codec->name, "vp9") == 0) { + test_16bit_internal = 1; + } +#endif } else { int i, match = 0; for (i = 0; ctrl_args[i]; i++) { @@ -996,12 +1148,13 @@ static int parse_stream_params(struct VpxEncoderConfig *global, * instance of this control. */ for (j = 0; j < config->arg_ctrl_cnt; j++) - if (config->arg_ctrls[j][0] == ctrl_args_map[i]) + if (ctrl_args_map != NULL && + config->arg_ctrls[j][0] == ctrl_args_map[i]) break; /* Update/insert */ assert(j < (int)ARG_CTRL_CNT_MAX); - if (j < (int)ARG_CTRL_CNT_MAX) { + if (ctrl_args_map != NULL && j < (int)ARG_CTRL_CNT_MAX) { config->arg_ctrls[j][0] = ctrl_args_map[i]; config->arg_ctrls[j][1] = arg_parse_enum_or_int(&arg); if (j == config->arg_ctrl_cnt) @@ -1014,6 +1167,12 @@ static int parse_stream_params(struct VpxEncoderConfig *global, argj++; } } +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + if (strcmp(global->codec->name, "vp9") == 0) { + config->use_16bit_internal = test_16bit_internal | + (config->cfg.g_profile > 1); + } +#endif return eos_mark_found; } @@ -1041,6 +1200,14 @@ static void validate_stream_config(const struct stream_state *stream, experimental_bitstream.long_name); } + // Check that the codec bit depth is greater than the input bit depth. 
+ if (stream->config.cfg.g_input_bit_depth > + (unsigned int)stream->config.cfg.g_bit_depth) { + fatal("Stream %d: codec bit depth (%d) less than input bit depth (%d)", + stream->index, (int)stream->config.cfg.g_bit_depth, + stream->config.cfg.g_input_bit_depth); + } + for (streami = stream; streami; streami = streami->next) { /* All streams require output files */ if (!streami->config.out_fn) @@ -1149,6 +1316,8 @@ static void show_stream_config(struct stream_state *stream, SHOW(g_profile); SHOW(g_w); SHOW(g_h); + SHOW(g_bit_depth); + SHOW(g_input_bit_depth); SHOW(g_timebase.num); SHOW(g_timebase.den); SHOW(g_error_resilient); @@ -1281,6 +1450,9 @@ static void initialize_encoder(struct stream_state *stream, flags |= global->show_psnr ? VPX_CODEC_USE_PSNR : 0; flags |= global->out_part ? VPX_CODEC_USE_OUTPUT_PARTITION : 0; +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + flags |= stream->config.use_16bit_internal ? VPX_CODEC_USE_HIGHBITDEPTH : 0; +#endif /* Construct Encoder Context */ vpx_codec_enc_init(&stream->encoder, global->codec->codec_interface(), @@ -1326,6 +1498,46 @@ static void encode_frame(struct stream_state *stream, / cfg->g_timebase.num / global->framerate.num; /* Scale if necessary */ +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + if (img) { + if ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) && + (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) { + if (img->fmt != VPX_IMG_FMT_I42016) { + fprintf(stderr, "%s can only scale 4:2:0 inputs\n", exec_name); + exit(EXIT_FAILURE); + } +#if CONFIG_LIBYUV + if (!stream->img) { + stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I42016, + cfg->g_w, cfg->g_h, 16); + } + I420Scale_16((uint16*)img->planes[VPX_PLANE_Y], + img->stride[VPX_PLANE_Y]/2, + (uint16*)img->planes[VPX_PLANE_U], + img->stride[VPX_PLANE_U]/2, + (uint16*)img->planes[VPX_PLANE_V], + img->stride[VPX_PLANE_V]/2, + img->d_w, img->d_h, + (uint16*)stream->img->planes[VPX_PLANE_Y], + stream->img->stride[VPX_PLANE_Y]/2, + (uint16*)stream->img->planes[VPX_PLANE_U], + 
stream->img->stride[VPX_PLANE_U]/2, + (uint16*)stream->img->planes[VPX_PLANE_V], + stream->img->stride[VPX_PLANE_V]/2, + stream->img->d_w, stream->img->d_h, + kFilterBox); + img = stream->img; +#else + stream->encoder.err = 1; + ctx_exit_on_error(&stream->encoder, + "Stream %d: Failed to encode frame.\n" + "Scaling disabled in this configuration. \n" + "To enable, configure with --enable-libyuv\n", + stream->index); +#endif + } + } +#endif if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) { if (img->fmt != VPX_IMG_FMT_I420 && img->fmt != VPX_IMG_FMT_YV12) { fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name); @@ -1504,6 +1716,131 @@ static float usec_to_fps(uint64_t usec, unsigned int frames) { return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0); } +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH +static void high_img_upshift(vpx_image_t *dst, vpx_image_t *src, + int input_shift) { + // Note the offset is 1 less than half + const int offset = input_shift > 0 ? 
(1 << (input_shift - 1)) - 1 : 0; + int plane; + if (dst->w != src->w || dst->h != src->h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || + dst->fmt != src->fmt || input_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_I42216: + case VPX_IMG_FMT_I44416: + break; + default: + fatal("Unsupported image conversion"); + break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->w; + int h = src->h; + int x, y; + if (plane) { + w >>= src->x_chroma_shift; + h >>= src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + uint16_t *p_src = (uint16_t *)(src->planes[plane] + + y * src->stride[plane]); + uint16_t *p_dst = (uint16_t *)(dst->planes[plane] + + y * dst->stride[plane]); + for (x = 0; x < w; x++) + *p_dst++ = (*p_src++ << input_shift) + offset; + } + } +} + +static void low_img_upshift(vpx_image_t *dst, vpx_image_t *src, + int input_shift) { + // Note the offset is 1 less than half + const int offset = input_shift > 0 ? 
(1 << (input_shift - 1)) - 1 : 0; + int plane; + if (dst->w != src->w || dst->h != src->h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || + dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH || + input_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_I422: + case VPX_IMG_FMT_I444: + break; + default: + fatal("Unsupported image conversion"); + break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->w; + int h = src->h; + int x, y; + if (plane) { + w >>= src->x_chroma_shift; + h >>= src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + uint8_t *p_src = src->planes[plane] + y * src->stride[plane]; + uint16_t *p_dst = (uint16_t *)(dst->planes[plane] + + y * dst->stride[plane]); + for (x = 0; x < w; x++) { + *p_dst++ = (*p_src++ << input_shift) + offset; + } + } + } +} + +static void img_upshift(vpx_image_t *dst, vpx_image_t *src, + int input_shift) { + if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + high_img_upshift(dst, src, input_shift); + } else { + low_img_upshift(dst, src, input_shift); + } +} + +static void img_cast_16_to_8(vpx_image_t *dst, vpx_image_t *src) { + int plane; + if (dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH != src->fmt || + dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift) { + fatal("Unsupported image conversion"); + } + switch (dst->fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_I422: + case VPX_IMG_FMT_I444: + break; + default: + fatal("Unsupported image conversion"); + break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w >>= src->x_chroma_shift; + h >>= src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + uint16_t *p_src = (uint16_t *)(src->planes[plane] + + y * src->stride[plane]); + uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane]; + for (x = 0; 
x < w; x++) { + *p_dst++ = *p_src++; + } + } + } +} +#endif static void test_decode(struct stream_state *stream, enum TestDecodeFatality fatal, @@ -1530,20 +1867,44 @@ static void test_decode(struct stream_state *stream, vpx_codec_control(&stream->encoder, VP8_COPY_REFERENCE, &ref_enc); vpx_codec_control(&stream->decoder, VP8_COPY_REFERENCE, &ref_dec); } else { - struct vp9_ref_frame ref; + struct vp9_ref_frame ref_enc, ref_dec; - ref.idx = 0; - vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref); - enc_img = ref.img; - vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref); - dec_img = ref.img; + ref_enc.idx = 0; + ref_dec.idx = 0; + vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref_enc); + enc_img = ref_enc.img; + vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref_dec); + dec_img = ref_dec.img; +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + if ((enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) != + (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH)) { + if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + vpx_img_alloc(&enc_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH, + enc_img.d_w, enc_img.d_h, 16); + img_cast_16_to_8(&enc_img, &ref_enc.img); + } + if (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + vpx_img_alloc(&dec_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH, + dec_img.d_w, dec_img.d_h, 16); + img_cast_16_to_8(&dec_img, &ref_dec.img); + } + } +#endif } ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame"); ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame"); if (!compare_img(&enc_img, &dec_img)) { int y[4], u[4], v[4]; +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + find_mismatch_high(&enc_img, &dec_img, y, u, v); + } else { + find_mismatch(&enc_img, &dec_img, y, u, v); + } +#else find_mismatch(&enc_img, &dec_img, y, u, v); +#endif stream->decoder.err = 1; warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL, "Stream %d: Encode/decode mismatch on 
frame %d at" @@ -1585,6 +1946,12 @@ static void print_time(const char *label, int64_t etl) { int main(int argc, const char **argv_) { int pass; vpx_image_t raw; +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + vpx_image_t raw_shift; + int allocated_raw_shift = 0; + int use_16bit_internal = 0; + int input_shift = 0; +#endif int frame_avail, got_data; struct VpxInputContext input; @@ -1686,6 +2053,27 @@ int main(int argc, const char **argv_) { if (!input.width || !input.height) fatal("Specify stream dimensions with --width (-w) " " and --height (-h)"); + + /* If input file does not specify bit-depth but input-bit-depth parameter + * exists, assume that to be the input bit-depth. However, if the + * input-bit-depth parameter does not exist, assume the input bit-depth + * to be the same as the codec bit-depth. + */ + if (!input.bit_depth) { + FOREACH_STREAM({ + if (stream->config.cfg.g_input_bit_depth) + input.bit_depth = stream->config.cfg.g_input_bit_depth; + else + input.bit_depth = stream->config.cfg.g_input_bit_depth = + (int)stream->config.cfg.g_bit_depth; + }); + if (input.bit_depth > 8) input.fmt |= VPX_IMG_FMT_HIGHBITDEPTH; + } else { + FOREACH_STREAM({ + stream->config.cfg.g_input_bit_depth = input.bit_depth; + }); + } + FOREACH_STREAM(set_stream_dimensions(stream, input.width, input.height)); FOREACH_STREAM(validate_stream_config(stream, &global)); @@ -1739,6 +2127,25 @@ int main(int argc, const char **argv_) { FOREACH_STREAM(open_output_file(stream, &global)); FOREACH_STREAM(initialize_encoder(stream, &global)); +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + if (strcmp(global.codec->name, "vp9") == 0) { + // Check to see if at least one stream uses 16 bit internal. + // Currently assume that the bit_depths for all streams using + // highbitdepth are the same. 
+ FOREACH_STREAM({ + if (stream->config.use_16bit_internal) { + use_16bit_internal = 1; + } + if (stream->config.cfg.g_profile == 0) { + input_shift = 0; + } else { + input_shift = (int)stream->config.cfg.g_bit_depth - + stream->config.cfg.g_input_bit_depth; + } + }); + } +#endif + frame_avail = 1; got_data = 0; @@ -1776,10 +2183,45 @@ int main(int argc, const char **argv_) { frame_avail = 0; if (frames_in > global.skip_frames) { +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + vpx_image_t *frame_to_encode; + if (input_shift || (use_16bit_internal && input.bit_depth == 8)) { + assert(use_16bit_internal); + // Input bit depth and stream bit depth do not match, so up + // shift frame to stream bit depth + if (!allocated_raw_shift) { + vpx_img_alloc(&raw_shift, raw.fmt | VPX_IMG_FMT_HIGHBITDEPTH, + input.width, input.height, 32); + allocated_raw_shift = 1; + } + img_upshift(&raw_shift, &raw, input_shift); + frame_to_encode = &raw_shift; + } else { + frame_to_encode = &raw; + } + vpx_usec_timer_start(&timer); + if (use_16bit_internal) { + assert(frame_to_encode->fmt & VPX_IMG_FMT_HIGHBITDEPTH); + FOREACH_STREAM({ + if (stream->config.use_16bit_internal) + encode_frame(stream, &global, + frame_avail ? frame_to_encode : NULL, + frames_in); + else + assert(0); + }); + } else { + assert((frame_to_encode->fmt & VPX_IMG_FMT_HIGHBITDEPTH) == 0); + FOREACH_STREAM(encode_frame(stream, &global, + frame_avail ? frame_to_encode : NULL, + frames_in)); + } +#else vpx_usec_timer_start(&timer); FOREACH_STREAM(encode_frame(stream, &global, frame_avail ? &raw : NULL, frames_in)); +#endif vpx_usec_timer_mark(&timer); cx_time += vpx_usec_timer_elapsed(&timer); @@ -1788,7 +2230,8 @@ int main(int argc, const char **argv_) { got_data = 0; FOREACH_STREAM(get_cx_data(stream, &global, &got_data)); - if (!got_data && input.length && !streams->frames_out) { + if (!got_data && input.length && streams != NULL && + !streams->frames_out) { lagged_count = global.limit ? 
seen_frames : ftello(input.file); } else if (input.length) { int64_t remaining; @@ -1896,6 +2339,10 @@ int main(int argc, const char **argv_) { }); #endif +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + if (allocated_raw_shift) + vpx_img_free(&raw_shift); +#endif vpx_img_free(&raw); free(argv); free(streams); diff --git a/source/libvpx/y4minput.c b/source/libvpx/y4minput.c index 520c332..bcc742a 100644 --- a/source/libvpx/y4minput.c +++ b/source/libvpx/y4minput.c @@ -700,7 +700,7 @@ static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst, int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, int only_420) { - char buffer[80]; + char buffer[80] = {0}; int ret; int i; /*Read until newline, or 80 cols, whichever happens first.*/ @@ -978,7 +978,9 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz); else _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz); - _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz); + + if (_y4m->aux_buf_sz > 0) + _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz); return 0; } |