aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJames Zern <jzern@google.com>2023-02-17 22:23:25 +0000
committerAutomerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>2023-02-17 22:23:25 +0000
commit9274fdb00707b14d247c02e46a8e0209f7472ea4 (patch)
tree07ff61be1997ddb98e63873efbd5cf4ebc49b564
parent29c548cedf1ca2af028e80fd92fa79c374712b4c (diff)
parent3a1231643009c2997316005c442b03666cd398ce (diff)
downloadlibvpx-9274fdb00707b14d247c02e46a8e0209f7472ea4.tar.gz
Merge "Merge commit 'v1.13.0' into m/master" am: 4b7acd814f am: 3a12316430
Original change: https://android-review.googlesource.com/c/platform/external/libvpx/+/2435878 Change-Id: I397a6baf6036abc6a9e94f63731af845036cc5d8 Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
-rw-r--r--.clang-format142
-rw-r--r--.gitignore2
-rw-r--r--.mailmap3
-rw-r--r--AUTHORS4
-rw-r--r--Android.bp27
-rw-r--r--CHANGELOG44
-rw-r--r--README.android8
-rw-r--r--README.version4
-rw-r--r--build/make/Android.mk3
-rw-r--r--build/make/Makefile14
-rw-r--r--build/make/configure.sh6
-rwxr-xr-xbuild/make/gen_asm_deps.sh2
-rwxr-xr-xbuild/make/rtcd.pl3
-rw-r--r--config/arm-neon/vp9_rtcd.h12
-rw-r--r--config/arm-neon/vpx_dsp_rtcd.h558
-rw-r--r--config/arm-neon/vpx_version.h8
-rw-r--r--config/arm64/vp9_rtcd.h12
-rw-r--r--config/arm64/vpx_dsp_rtcd.h558
-rw-r--r--config/arm64/vpx_version.h8
-rw-r--r--config/generic/vpx_version.h8
-rw-r--r--config/x86/vp9_rtcd.h6
-rw-r--r--config/x86/vpx_dsp_rtcd.h3
-rw-r--r--config/x86/vpx_version.h8
-rw-r--r--config/x86_64/vpx_dsp_rtcd.h3
-rw-r--r--config/x86_64/vpx_version.h8
-rwxr-xr-xconfigure13
-rw-r--r--examples/svc_encodeframe.c7
-rw-r--r--examples/vp9_spatial_svc_encoder.c4
-rw-r--r--libs.doxy_template40
-rw-r--r--libs.mk16
-rw-r--r--md5_utils.c4
-rw-r--r--test/acm_random.h16
-rw-r--r--test/android/Android.mk4
-rw-r--r--test/comp_avg_pred_test.cc171
-rw-r--r--test/dct16x16_test.cc12
-rw-r--r--test/dct_partial_test.cc10
-rw-r--r--test/dct_test.cc23
-rw-r--r--test/encode_api_test.cc6
-rw-r--r--test/encode_test_driver.cc11
-rw-r--r--test/encode_test_driver.h10
-rw-r--r--test/error_resilience_test.cc2
-rw-r--r--test/frame_size_tests.cc6
-rw-r--r--test/hadamard_test.cc3
-rw-r--r--test/md5_helper.h2
-rw-r--r--test/pp_filter_test.cc13
-rw-r--r--test/resize_test.cc30
-rw-r--r--test/sad_test.cc230
-rw-r--r--test/svc_datarate_test.cc196
-rw-r--r--test/test.mk1
-rwxr-xr-xtest/tools_common.sh4
-rw-r--r--test/variance_test.cc295
-rw-r--r--test/vp8_datarate_test.cc2
-rw-r--r--test/vp8_ratectrl_rtc_test.cc3
-rw-r--r--test/vp9_datarate_test.cc157
-rw-r--r--test/vp9_ext_ratectrl_test.cc798
-rw-r--r--test/vp9_quantize_test.cc309
-rw-r--r--test/vp9_ratectrl_rtc_test.cc269
-rw-r--r--test/vp9_subtract_test.cc161
-rw-r--r--third_party/googletest/README.libvpx10
-rw-r--r--third_party/googletest/src/.clang-format4
-rw-r--r--third_party/googletest/src/CONTRIBUTORS2
-rw-r--r--third_party/googletest/src/README.md8
-rw-r--r--third_party/googletest/src/include/gtest/gtest-assertion-result.h237
-rw-r--r--third_party/googletest/src/include/gtest/gtest-death-test.h89
-rw-r--r--third_party/googletest/src/include/gtest/gtest-matchers.h76
-rw-r--r--third_party/googletest/src/include/gtest/gtest-message.h27
-rw-r--r--third_party/googletest/src/include/gtest/gtest-param-test.h87
-rw-r--r--third_party/googletest/src/include/gtest/gtest-printers.h65
-rw-r--r--third_party/googletest/src/include/gtest/gtest-spi.h132
-rw-r--r--third_party/googletest/src/include/gtest/gtest-test-part.h14
-rw-r--r--third_party/googletest/src/include/gtest/gtest-typed-test.h38
-rw-r--r--third_party/googletest/src/include/gtest/gtest.h532
-rw-r--r--third_party/googletest/src/include/gtest/gtest_pred_impl.h200
-rw-r--r--third_party/googletest/src/include/gtest/gtest_prod.h9
-rw-r--r--third_party/googletest/src/include/gtest/internal/custom/README.md12
-rw-r--r--third_party/googletest/src/include/gtest/internal/custom/gtest-port.h31
-rw-r--r--third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h74
-rw-r--r--third_party/googletest/src/include/gtest/internal/gtest-filepath.h17
-rw-r--r--third_party/googletest/src/include/gtest/internal/gtest-internal.h330
-rw-r--r--third_party/googletest/src/include/gtest/internal/gtest-param-util.h145
-rw-r--r--third_party/googletest/src/include/gtest/internal/gtest-port-arch.h100
-rw-r--r--third_party/googletest/src/include/gtest/internal/gtest-port.h982
-rw-r--r--third_party/googletest/src/include/gtest/internal/gtest-string.h18
-rw-r--r--third_party/googletest/src/include/gtest/internal/gtest-type-util.h21
-rw-r--r--third_party/googletest/src/src/gtest-all.cc3
-rw-r--r--third_party/googletest/src/src/gtest-assertion-result.cc77
-rw-r--r--third_party/googletest/src/src/gtest-death-test.cc520
-rw-r--r--third_party/googletest/src/src/gtest-filepath.cc78
-rw-r--r--third_party/googletest/src/src/gtest-internal-inl.h217
-rw-r--r--third_party/googletest/src/src/gtest-matchers.cc5
-rw-r--r--third_party/googletest/src/src/gtest-port.cc349
-rw-r--r--third_party/googletest/src/src/gtest-printers.cc124
-rw-r--r--third_party/googletest/src/src/gtest-test-part.cc19
-rw-r--r--third_party/googletest/src/src/gtest-typed-test.cc7
-rw-r--r--third_party/googletest/src/src/gtest.cc1509
-rw-r--r--third_party/googletest/src/src/gtest_main.cc5
-rw-r--r--tools/3D-Reconstruction/MotionEST/Exhaust.py2
-rw-r--r--tools/3D-Reconstruction/MotionEST/GroundTruth.py4
-rw-r--r--tools/3D-Reconstruction/MotionEST/MotionEST.py4
-rw-r--r--tools/3D-Reconstruction/MotionEST/Util.py2
-rw-r--r--vp8/common/findnearmv.c4
-rw-r--r--vp8/common/mips/dspr2/filter_dspr2.c12
-rw-r--r--vp8/common/mips/msa/vp8_macros_msa.h24
-rw-r--r--vp8/decoder/dboolhuff.h4
-rw-r--r--vp8/decoder/decodemv.c4
-rw-r--r--vp8/decoder/onyxd_int.h4
-rw-r--r--vp8/encoder/bitstream.c5
-rw-r--r--vp8/encoder/encodemv.c10
-rw-r--r--vp8/encoder/firstpass.c31
-rw-r--r--vp8/encoder/loongarch/vp8_quantize_lsx.c (renamed from vp8/encoder/loongarch/quantize_lsx.c)0
-rw-r--r--vp8/encoder/mcomp.c29
-rw-r--r--vp8/encoder/onyx_if.c15
-rw-r--r--vp8/encoder/onyx_int.h8
-rw-r--r--vp8/encoder/rdopt.c2
-rw-r--r--vp8/encoder/x86/denoising_sse2.c2
-rw-r--r--vp8/encoder/x86/quantize_sse4.c5
-rw-r--r--vp8/vp8_dx_iface.c2
-rw-r--r--vp8/vp8_ratectrl_rtc.cc61
-rw-r--r--vp8/vp8cx.mk2
-rw-r--r--vp9/common/vp9_common.h2
-rw-r--r--vp9/common/vp9_loopfilter.c2
-rw-r--r--vp9/common/vp9_rtcd_defs.pl9
-rw-r--r--vp9/common/vp9_scan.c293
-rw-r--r--vp9/decoder/vp9_decodemv.c6
-rw-r--r--vp9/decoder/vp9_detokenize.c15
-rw-r--r--vp9/encoder/arm/neon/vp9_dct_neon.c1265
-rw-r--r--vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c322
-rw-r--r--vp9/encoder/arm/neon/vp9_frame_scale_neon.c5
-rw-r--r--vp9/encoder/arm/neon/vp9_quantize_neon.c495
-rw-r--r--vp9/encoder/vp9_aq_cyclicrefresh.c4
-rw-r--r--vp9/encoder/vp9_bitstream.c6
-rw-r--r--vp9/encoder/vp9_denoiser.c7
-rw-r--r--vp9/encoder/vp9_encodeframe.c14
-rw-r--r--vp9/encoder/vp9_encoder.c155
-rw-r--r--vp9/encoder/vp9_encoder.h4
-rw-r--r--vp9/encoder/vp9_ext_ratectrl.c65
-rw-r--r--vp9/encoder/vp9_ext_ratectrl.h13
-rw-r--r--vp9/encoder/vp9_firstpass.c53
-rw-r--r--vp9/encoder/vp9_pickmode.c22
-rw-r--r--vp9/encoder/vp9_quantize.c28
-rw-r--r--vp9/encoder/vp9_quantize.h3
-rw-r--r--vp9/encoder/vp9_ratectrl.c55
-rw-r--r--vp9/encoder/vp9_ratectrl.h4
-rw-r--r--vp9/encoder/vp9_rd.c20
-rw-r--r--vp9/encoder/vp9_rdopt.c47
-rw-r--r--vp9/encoder/vp9_segmentation.c2
-rw-r--r--vp9/encoder/vp9_svc_layercontext.c10
-rw-r--r--vp9/encoder/vp9_svc_layercontext.h2
-rw-r--r--vp9/encoder/vp9_temporal_filter.c12
-rw-r--r--vp9/encoder/x86/highbd_temporal_filter_sse4.c135
-rw-r--r--vp9/encoder/x86/vp9_dct_intrin_sse2.c6
-rw-r--r--vp9/encoder/x86/vp9_diamond_search_sad_avx.c10
-rw-r--r--vp9/encoder/x86/vp9_frame_scale_ssse3.c4
-rw-r--r--vp9/encoder/x86/vp9_quantize_avx2.c456
-rw-r--r--vp9/encoder/x86/vp9_quantize_sse2.c230
-rw-r--r--vp9/encoder/x86/vp9_quantize_ssse3.c253
-rw-r--r--vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm178
-rw-r--r--vp9/ratectrl_rtc.cc16
-rw-r--r--vp9/ratectrl_rtc.h10
-rw-r--r--vp9/simple_encode.cc10
-rw-r--r--vp9/vp9_cx_iface.c35
-rw-r--r--vp9/vp9_dx_iface.c3
-rw-r--r--vp9/vp9cx.mk6
-rw-r--r--vpx/vp8cx.h12
-rw-r--r--vpx/vpx_encoder.h15
-rw-r--r--vpx/vpx_ext_ratectrl.h164
-rw-r--r--vpx_dsp/arm/fdct16x16_neon.c81
-rw-r--r--vpx_dsp/arm/fdct16x16_neon.h587
-rw-r--r--vpx_dsp/arm/fdct32x32_neon.c1574
-rw-r--r--vpx_dsp/arm/fdct32x32_neon.h2919
-rw-r--r--vpx_dsp/arm/fdct4x4_neon.c (renamed from vpx_dsp/arm/fdct_neon.c)43
-rw-r--r--vpx_dsp/arm/fdct4x4_neon.h105
-rw-r--r--vpx_dsp/arm/fdct8x8_neon.c143
-rw-r--r--vpx_dsp/arm/fdct8x8_neon.h381
-rw-r--r--vpx_dsp/arm/fdct_neon.h602
-rw-r--r--vpx_dsp/arm/fdct_partial_neon.c65
-rw-r--r--vpx_dsp/arm/fwd_txfm_neon.c68
-rw-r--r--vpx_dsp/arm/hadamard_neon.c42
-rw-r--r--vpx_dsp/arm/highbd_quantize_neon.c307
-rw-r--r--vpx_dsp/arm/highbd_sad_neon.c225
-rw-r--r--vpx_dsp/arm/highbd_variance_neon.c496
-rw-r--r--vpx_dsp/arm/mem_neon.h165
-rw-r--r--vpx_dsp/arm/quantize_neon.c287
-rw-r--r--vpx_dsp/arm/sad4d_neon.c25
-rw-r--r--vpx_dsp/arm/sad_neon.c298
-rw-r--r--vpx_dsp/arm/subpel_variance_neon.c608
-rw-r--r--vpx_dsp/arm/subtract_neon.c56
-rw-r--r--vpx_dsp/arm/transpose_neon.h79
-rw-r--r--vpx_dsp/arm/variance_neon.c548
-rw-r--r--vpx_dsp/arm/vpx_convolve8_neon.c1101
-rw-r--r--vpx_dsp/arm/vpx_convolve8_neon.h206
-rw-r--r--vpx_dsp/arm/vpx_scaled_convolve8_neon.c10
-rw-r--r--vpx_dsp/avg.c3
-rw-r--r--vpx_dsp/bitwriter.h5
-rw-r--r--vpx_dsp/loongarch/quantize_lsx.c13
-rw-r--r--vpx_dsp/loopfilter.c12
-rw-r--r--vpx_dsp/mips/macros_msa.h46
-rw-r--r--vpx_dsp/ppc/quantize_vsx.c24
-rw-r--r--vpx_dsp/psnr.c67
-rw-r--r--vpx_dsp/variance.c6
-rw-r--r--vpx_dsp/vpx_dsp.mk14
-rw-r--r--vpx_dsp/vpx_dsp_rtcd_defs.pl347
-rw-r--r--vpx_dsp/x86/avg_intrin_avx2.c4
-rw-r--r--vpx_dsp/x86/avg_intrin_sse2.c4
-rw-r--r--vpx_dsp/x86/convolve_avx2.h5
-rw-r--r--vpx_dsp/x86/fwd_dct32x32_impl_avx2.h2
-rw-r--r--vpx_dsp/x86/fwd_dct32x32_impl_sse2.h2
-rw-r--r--vpx_dsp/x86/highbd_inv_txfm_sse2.h2
-rw-r--r--vpx_dsp/x86/highbd_loopfilter_sse2.c8
-rw-r--r--vpx_dsp/x86/highbd_quantize_intrin_avx2.c258
-rw-r--r--vpx_dsp/x86/highbd_quantize_intrin_sse2.c13
-rw-r--r--vpx_dsp/x86/highbd_sad4d_avx2.c401
-rw-r--r--vpx_dsp/x86/highbd_sad_avx2.c468
-rw-r--r--vpx_dsp/x86/highbd_variance_sse2.c47
-rw-r--r--vpx_dsp/x86/inv_txfm_sse2.c4
-rw-r--r--vpx_dsp/x86/loopfilter_avx2.c4
-rw-r--r--vpx_dsp/x86/loopfilter_sse2.c14
-rw-r--r--vpx_dsp/x86/mem_sse2.h4
-rw-r--r--vpx_dsp/x86/post_proc_sse2.c2
-rw-r--r--vpx_dsp/x86/quantize_avx.c12
-rw-r--r--vpx_dsp/x86/quantize_avx2.c293
-rw-r--r--vpx_dsp/x86/quantize_sse2.c5
-rw-r--r--vpx_dsp/x86/quantize_sse2.h17
-rw-r--r--vpx_dsp/x86/quantize_ssse3.c11
-rw-r--r--vpx_dsp/x86/sad_avx2.c15
-rw-r--r--vpx_dsp/x86/subtract_avx2.c203
-rw-r--r--vpx_dsp/x86/sum_squares_sse2.c2
-rw-r--r--vpx_dsp/x86/variance_avx2.c17
-rw-r--r--vpx_dsp/x86/variance_sse2.c12
-rw-r--r--vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c6
-rw-r--r--vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c34
-rw-r--r--vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c6
-rw-r--r--vpx_ports/compiler_attributes.h12
-rw-r--r--vpxenc.c5
-rw-r--r--webmdec.h2
-rw-r--r--y4menc.c41
-rw-r--r--y4minput.c15
237 files changed, 19614 insertions, 9068 deletions
diff --git a/.clang-format b/.clang-format
index 866b7e211..a8bc4967c 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,149 +1,9 @@
---
Language: Cpp
-# BasedOnStyle: Google
-# Generated with clang-format 7.0.1
-AccessModifierOffset: -1
-AlignAfterOpenBracket: Align
-AlignConsecutiveAssignments: false
-AlignConsecutiveDeclarations: false
-AlignEscapedNewlines: Left
-AlignOperands: true
-AlignTrailingComments: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: false
+BasedOnStyle: Google
AllowShortCaseLabelsOnASingleLine: true
-AllowShortFunctionsOnASingleLine: All
-AllowShortIfStatementsOnASingleLine: true
-AllowShortLoopsOnASingleLine: true
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: true
-AlwaysBreakTemplateDeclarations: true
-BinPackArguments: true
-BinPackParameters: true
-BraceWrapping:
- AfterClass: false
- AfterControlStatement: false
- AfterEnum: false
- AfterFunction: false
- AfterNamespace: false
- AfterObjCDeclaration: false
- AfterStruct: false
- AfterUnion: false
- AfterExternBlock: false
- BeforeCatch: false
- BeforeElse: false
- IndentBraces: false
- SplitEmptyFunction: true
- SplitEmptyRecord: true
- SplitEmptyNamespace: true
-BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Attach
-BreakBeforeInheritanceComma: false
-BreakInheritanceList: BeforeColon
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-BreakConstructorInitializers: BeforeColon
-BreakAfterJavaFieldAnnotations: false
-BreakStringLiterals: true
-ColumnLimit: 80
-CommentPragmas: '^ IWYU pragma:'
-CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DerivePointerAlignment: false
-DisableFormat: false
-ExperimentalAutoDetectBinPacking: false
-FixNamespaceComments: true
-ForEachMacros:
- - foreach
- - Q_FOREACH
- - BOOST_FOREACH
-IncludeBlocks: Preserve
-IncludeCategories:
- - Regex: '^<ext/.*\.h>'
- Priority: 2
- - Regex: '^<.*\.h>'
- Priority: 1
- - Regex: '^<.*'
- Priority: 2
- - Regex: '.*'
- Priority: 3
-IncludeIsMainRegex: '([-_](test|unittest))?$'
-IndentCaseLabels: true
-IndentPPDirectives: None
-IndentWidth: 2
-IndentWrappedFunctionNames: false
-JavaScriptQuotes: Leave
-JavaScriptWrapImports: true
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd: ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBinPackProtocolList: Never
-ObjCBlockIndentWidth: 2
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: false
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 1
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakTemplateDeclaration: 10
-PenaltyBreakString: 1000
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Right
-RawStringFormats:
- - Language: Cpp
- Delimiters:
- - cc
- - CC
- - cpp
- - Cpp
- - CPP
- - 'c++'
- - 'C++'
- CanonicalDelimiter: ''
- BasedOnStyle: google
- - Language: TextProto
- Delimiters:
- - pb
- - PB
- - proto
- - PROTO
- EnclosingFunctions:
- - EqualsProto
- - EquivToProto
- - PARSE_PARTIAL_TEXT_PROTO
- - PARSE_TEST_PROTO
- - PARSE_TEXT_PROTO
- - ParseTextOrDie
- - ParseTextProtoOrDie
- CanonicalDelimiter: ''
- BasedOnStyle: google
-ReflowComments: true
SortIncludes: false
-SortUsingDeclarations: true
-SpaceAfterCStyleCast: false
-SpaceAfterTemplateKeyword: true
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 2
-SpacesInAngles: false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard: Auto
-TabWidth: 8
-UseTab: Never
-...
-
diff --git a/.gitignore b/.gitignore
index 5f2683538..99eeb92d0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,8 +7,10 @@
*.o
*~
.cproject
+.idea
.project
.settings
+.vscode
/*-*.mk
/*.asm
/*.doxy
diff --git a/.mailmap b/.mailmap
index 376ca83ae..bb0ddd95b 100644
--- a/.mailmap
+++ b/.mailmap
@@ -21,10 +21,11 @@ Jacky Chen <jackychen@google.com>
Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
-Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
+Johann <johann@duck.com> <johann.koenig@gmail.com>
John Koleszar <jkoleszar@google.com>
Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
+Konstantinos Margaritis <konma@vectorcamp.gr> <konstantinos@vectorcamp.gr>
Marco Paniconi <marpan@google.com>
Marco Paniconi <marpan@google.com> <marpan@chromium.org>
Martin Storsjö <martin@martin.st>
diff --git a/AUTHORS b/AUTHORS
index fffda6336..2db4a113e 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -21,8 +21,10 @@ Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
Andrew Lewis <andrewlewis@google.com>
Andrew Russell <anrussell@google.com>
+Andrew Salkeld <andrew.salkeld@arm.com>
Angie Chen <yunqi@google.com>
Angie Chiang <angiebird@google.com>
+Anton Venema <anton.venema@liveswitch.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
Birk Magnussen <birk.magnussen@googlemail.com>
@@ -174,7 +176,9 @@ Rob Bradford <rob@linux.intel.com>
Ronald S. Bultje <rsbultje@gmail.com>
Rui Ueyama <ruiu@google.com>
Sai Deng <sdeng@google.com>
+Salome Thirot <salome.thirot@arm.com>
Sami Pietilä <samipietila@google.com>
+Sam James <sam@gentoo.org>
Sarah Parker <sarahparker@google.com>
Sasi Inguva <isasi@google.com>
Scott Graham <scottmg@chromium.org>
diff --git a/Android.bp b/Android.bp
index 5a914a935..952765b90 100644
--- a/Android.bp
+++ b/Android.bp
@@ -110,6 +110,7 @@ libvpx_arm_neon_c_srcs = [
"vp9/decoder/vp9_dsubexp.c",
"vp9/decoder/vp9_job_queue.c",
"vp9/encoder/arm/neon/vp9_dct_neon.c",
+ "vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c",
"vp9/encoder/arm/neon/vp9_frame_scale_neon.c",
"vp9/encoder/arm/neon/vp9_quantize_neon.c",
"vp9/encoder/vp9_aq_cyclicrefresh.c",
@@ -152,11 +153,11 @@ libvpx_arm_neon_c_srcs = [
"vpx/src/vpx_image.c",
"vpx_dsp/arm/avg_neon.c",
"vpx_dsp/arm/avg_pred_neon.c",
+ "vpx_dsp/arm/fdct4x4_neon.c",
+ "vpx_dsp/arm/fdct8x8_neon.c",
"vpx_dsp/arm/fdct16x16_neon.c",
"vpx_dsp/arm/fdct32x32_neon.c",
- "vpx_dsp/arm/fdct_neon.c",
"vpx_dsp/arm/fdct_partial_neon.c",
- "vpx_dsp/arm/fwd_txfm_neon.c",
"vpx_dsp/arm/hadamard_neon.c",
"vpx_dsp/arm/highbd_idct4x4_add_neon.c",
"vpx_dsp/arm/highbd_idct8x8_add_neon.c",
@@ -167,6 +168,9 @@ libvpx_arm_neon_c_srcs = [
"vpx_dsp/arm/highbd_idct32x32_add_neon.c",
"vpx_dsp/arm/highbd_intrapred_neon.c",
"vpx_dsp/arm/highbd_loopfilter_neon.c",
+ "vpx_dsp/arm/highbd_quantize_neon.c",
+ "vpx_dsp/arm/highbd_sad_neon.c",
+ "vpx_dsp/arm/highbd_variance_neon.c",
"vpx_dsp/arm/highbd_vpx_convolve8_neon.c",
"vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c",
"vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c",
@@ -350,6 +354,7 @@ libvpx_arm64_c_srcs = [
"vp9/decoder/vp9_dsubexp.c",
"vp9/decoder/vp9_job_queue.c",
"vp9/encoder/arm/neon/vp9_dct_neon.c",
+ "vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c",
"vp9/encoder/arm/neon/vp9_frame_scale_neon.c",
"vp9/encoder/arm/neon/vp9_quantize_neon.c",
"vp9/encoder/vp9_aq_cyclicrefresh.c",
@@ -392,11 +397,11 @@ libvpx_arm64_c_srcs = [
"vpx/src/vpx_image.c",
"vpx_dsp/arm/avg_neon.c",
"vpx_dsp/arm/avg_pred_neon.c",
+ "vpx_dsp/arm/fdct4x4_neon.c",
+ "vpx_dsp/arm/fdct8x8_neon.c",
"vpx_dsp/arm/fdct16x16_neon.c",
"vpx_dsp/arm/fdct32x32_neon.c",
- "vpx_dsp/arm/fdct_neon.c",
"vpx_dsp/arm/fdct_partial_neon.c",
- "vpx_dsp/arm/fwd_txfm_neon.c",
"vpx_dsp/arm/hadamard_neon.c",
"vpx_dsp/arm/highbd_idct4x4_add_neon.c",
"vpx_dsp/arm/highbd_idct8x8_add_neon.c",
@@ -407,6 +412,9 @@ libvpx_arm64_c_srcs = [
"vpx_dsp/arm/highbd_idct32x32_add_neon.c",
"vpx_dsp/arm/highbd_intrapred_neon.c",
"vpx_dsp/arm/highbd_loopfilter_neon.c",
+ "vpx_dsp/arm/highbd_quantize_neon.c",
+ "vpx_dsp/arm/highbd_sad_neon.c",
+ "vpx_dsp/arm/highbd_variance_neon.c",
"vpx_dsp/arm/highbd_vpx_convolve8_neon.c",
"vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c",
"vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c",
@@ -747,6 +755,7 @@ libvpx_x86_c_srcs = [
"vp9/encoder/x86/vp9_frame_scale_ssse3.c",
"vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c",
"vp9/encoder/x86/vp9_quantize_sse2.c",
+ "vp9/encoder/x86/vp9_quantize_ssse3.c",
"vp9/vp9_cx_iface.c",
"vp9/vp9_dx_iface.c",
"vp9/vp9_iface_common.c",
@@ -982,6 +991,7 @@ libvpx_x86_64_c_srcs = [
"vp9/encoder/x86/vp9_frame_scale_ssse3.c",
"vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c",
"vp9/encoder/x86/vp9_quantize_sse2.c",
+ "vp9/encoder/x86/vp9_quantize_ssse3.c",
"vp9/vp9_cx_iface.c",
"vp9/vp9_dx_iface.c",
"vp9/vp9_iface_common.c",
@@ -1062,7 +1072,6 @@ libvpx_x86_64_asm_srcs = [
"vp8/encoder/x86/fwalsh_sse2.asm",
"vp9/encoder/x86/vp9_dct_sse2.asm",
"vp9/encoder/x86/vp9_error_sse2.asm",
- "vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm",
"vpx_dsp/x86/add_noise_sse2.asm",
"vpx_dsp/x86/avg_ssse3_x86_64.asm",
"vpx_dsp/x86/deblock_sse2.asm",
@@ -1230,10 +1239,6 @@ cc_fuzz {
local_include_dirs: ["config/arm64"],
},
- riscv64: {
- local_include_dirs: ["config/generic"],
- },
-
x86: {
local_include_dirs: ["config/x86"],
},
@@ -1270,10 +1275,6 @@ cc_fuzz {
local_include_dirs: ["config/arm64"],
},
- riscv64: {
- local_include_dirs: ["config/generic"],
- },
-
x86: {
local_include_dirs: ["config/x86"],
},
diff --git a/CHANGELOG b/CHANGELOG
index cd4e8ba43..3fb2d19bb 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,39 @@
+2023-01-31 v1.13.0 "Ugly Duckling"
+ This release includes more Neon and AVX2 optimizations, adds a new codec
+ control to set per frame QP, upgrades GoogleTest to v1.12.1, and includes
+ numerous bug fixes.
+
+ - Upgrading:
+ This release is ABI incompatible with the previous release.
+
+ New codec control VP9E_SET_QUANTIZER_ONE_PASS to set per frame QP.
+
+ GoogleTest is upgraded to v1.12.1.
+
+ .clang-format is upgraded to clang-format-11.
+
+ VPX_EXT_RATECTRL_ABI_VERSION was bumped due to incompatible changes to the
+ feature of using external rate control models for vp9.
+
+ - Enhancement:
+ Numerous improvements on Neon optimizations.
+ Numerous improvements on AVX2 optimizations.
+ Additional ARM targets added for Visual Studio.
+
+ - Bug fixes:
+ Fix to calculating internal stats when frame dropped.
+ Fix to segfault for external resize test in vp9.
+ Fix to build system with replacing egrep with grep -E.
+ Fix to a few bugs with external RTC rate control library.
+ Fix to make SVC work with VBR.
+ Fix to key frame setting in VP9 external RC.
+ Fix to -Wimplicit-int (Clang 16).
+ Fix to VP8 external RC for buffer levels.
+ Fix to VP8 external RC for dynamic update of layers.
+ Fix to VP9 auto level.
+ Fix to off-by-one error of max w/h in validate_config.
+ Fix to make SVC work for Profile 1.
+
2022-06-17 v1.12.0 "Torrent Duck"
This release adds optimizations for Loongarch, adds support for vp8 in the
real-time rate control library, upgrades GoogleTest to v1.11.0, updates
@@ -36,6 +72,7 @@
levels, and includes several improvements to NEON and numerous bug fixes.
- Upgrading:
+ This release is ABI incompatible with the previous release.
New codec control is added to get quantization parameters and loop filter
levels.
@@ -61,6 +98,7 @@
well as numerous bug fixes.
- Upgrading:
+ This release is ABI incompatible with the previous release.
New codec control is added to disable loopfilter for VP9.
New encoder control is added to disable feature to increase Q on overshoot
@@ -91,6 +129,7 @@
well as incremental improvements.
- Upgrading:
+ This release is ABI compatible with the previous release.
NV12 support is added to this release.
A new interface is added for VP9 rate control. The new library libvp9rc.a
must be linked by applications.
@@ -114,12 +153,14 @@
This release collects incremental improvements to many aspects of the library.
- Upgrading:
+ This release is ABI compatible with the previous release.
ARCH_* defines have been removed in favor of VPX_ARCH_*.
2019-07-15 v1.8.1 "Orpington Duck"
This release collects incremental improvements to many aspects of the library.
- Upgrading:
+ This release is ABI incompatible with the previous release.
VP8E_SET_CPUUSED now accepts values up to 9 for vp9.
VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT had a spelling fix (was VP8E).
The --sdk-path option has been removed. If you were using it to build for
@@ -138,7 +179,8 @@
This release focused on encoding performance for realtime and VOD use cases.
- Upgrading:
- This adds and improves several vp9 controls. Most are related to SVC:
+ This release is ABI incompatible with the previous release. This adds and
+ improves several vp9 controls. Most are related to SVC:
VP9E_SET_SVC_FRAME_DROP_LAYER:
- Frame dropping in SVC.
VP9E_SET_SVC_INTER_LAYER_PRED:
diff --git a/README.android b/README.android
index 38780acec..bbcf66cba 100644
--- a/README.android
+++ b/README.android
@@ -1,12 +1,12 @@
Name: libvpx
URL: http://www.webmproject.org
-Version: v1.12.0
+Version: v1.13.0
License: BSD
License File: libvpx/LICENSE
-Date: Thursday June 30 2022
-Branch: origin/torrent
-Commit: 03265cd42b3783532de72f2ded5436652e6f5ce3
+Date: Friday February 10 2023
+Branch: v1.13.0
+Commit: d6eb9696aa72473c1a11d34d928d35a3acc0c9a9
Description:
Contains the sources used to compile libvpx.
diff --git a/README.version b/README.version
index 7dfba96ef..9858b39c9 100644
--- a/README.version
+++ b/README.version
@@ -1,5 +1,5 @@
-URL: https://chromium.googlesource.com/webm/libvpx/+archive/v1.12.0.tar.gz
-Version: v1.12.0
+URL: https://chromium.googlesource.com/webm/libvpx/
+Version: v1.13.0
BugComponent: 42195
Owners: jzern, jianj
Local Modifications:
diff --git a/build/make/Android.mk b/build/make/Android.mk
index b8032e67a..ba24f541b 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -8,6 +8,8 @@
## be found in the AUTHORS file in the root of the source tree.
##
+# Ignore this file during non-NDK builds.
+ifdef NDK_ROOT
#
# This file is to be used for compiling libvpx for Android using the NDK.
# In an Android project place a libvpx checkout in the jni directory.
@@ -212,3 +214,4 @@ endif
ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
$(call import-module,android/cpufeatures)
endif
+endif # NDK_ROOT
diff --git a/build/make/Makefile b/build/make/Makefile
index b7a873cc8..5c38c18e5 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -21,9 +21,9 @@ all: .DEFAULT
clean:: .DEFAULT
exampletest: .DEFAULT
install:: .DEFAULT
-test:: .DEFAULT
-test-no-data-check:: .DEFAULT
-testdata:: .DEFAULT
+test: .DEFAULT
+test-no-data-check: .DEFAULT
+testdata: .DEFAULT
utiltest: .DEFAULT
exampletest-no-data-check utiltest-no-data-check: .DEFAULT
test_%: .DEFAULT ;
@@ -111,13 +111,13 @@ exampletest:
.PHONY: install
install::
.PHONY: test
-test::
+test:
.PHONY: testdata
-testdata::
+testdata:
.PHONY: utiltest
utiltest:
.PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check
-test-no-data-check::
+test-no-data-check:
exampletest-no-data-check utiltest-no-data-check:
# Force to realign stack always on OS/2
@@ -465,6 +465,6 @@ INSTALL_TARGETS += .install-docs .install-srcs .install-libs .install-bins
all: $(BUILD_TARGETS)
install:: $(INSTALL_TARGETS)
dist: $(INSTALL_TARGETS)
-test::
+test:
.SUFFIXES: # Delete default suffix rules
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 581042e38..4bf090f00 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -791,7 +791,7 @@ process_common_toolchain() {
tgt_isa=x86_64
tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'`
;;
- *darwin2[0-1]*)
+ *darwin2[0-2]*)
tgt_isa=`uname -m`
tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'`
;;
@@ -940,7 +940,7 @@ process_common_toolchain() {
add_cflags "-mmacosx-version-min=10.15"
add_ldflags "-mmacosx-version-min=10.15"
;;
- *-darwin2[0-1]-*)
+ *-darwin2[0-2]-*)
add_cflags "-arch ${toolchain%%-*}"
add_ldflags "-arch ${toolchain%%-*}"
;;
@@ -1511,7 +1511,7 @@ EOF
# Try to find which inline keywords are supported
check_cc <<EOF && INLINE="inline"
-static inline function() {}
+static inline int function(void) {}
EOF
# Almost every platform uses pthreads.
diff --git a/build/make/gen_asm_deps.sh b/build/make/gen_asm_deps.sh
index 6a7bff9eb..3bd4d125f 100755
--- a/build/make/gen_asm_deps.sh
+++ b/build/make/gen_asm_deps.sh
@@ -42,7 +42,7 @@ done
[ -n "$srcfile" ] || show_help
sfx=${sfx:-asm}
-includes=$(LC_ALL=C egrep -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile |
+includes=$(LC_ALL=C grep -E -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile |
perl -p -e "s;.*?([a-z0-9_/]+.${sfx}).*;\1;")
#" restore editor state
for inc in ${includes}; do
diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl
index 9c9726842..f4edeaad5 100755
--- a/build/make/rtcd.pl
+++ b/build/make/rtcd.pl
@@ -488,7 +488,8 @@ if ($opts{arch} eq 'x86') {
arm;
} elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
@ALL_ARCHS = filter(qw/neon/);
- &require("neon");
+ @REQUIRES = filter(qw/neon/);
+ &require(@REQUIRES);
arm;
} elsif ($opts{arch} =~ /^ppc/ ) {
@ALL_ARCHS = filter(qw/vsx/);
diff --git a/config/arm-neon/vp9_rtcd.h b/config/arm-neon/vp9_rtcd.h
index 01065e667..b2b2fc2dc 100644
--- a/config/arm-neon/vp9_rtcd.h
+++ b/config/arm-neon/vp9_rtcd.h
@@ -38,7 +38,8 @@ int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_neon
void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -62,7 +63,8 @@ void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_neon
void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
@@ -83,10 +85,12 @@ void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_neon
void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_neon
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+void vp9_highbd_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_neon
void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count);
#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
diff --git a/config/arm-neon/vpx_dsp_rtcd.h b/config/arm-neon/vpx_dsp_rtcd.h
index 99abbb974..565105892 100644
--- a/config/arm-neon/vpx_dsp_rtcd.h
+++ b/config/arm-neon/vpx_dsp_rtcd.h
@@ -287,7 +287,8 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran
#define vpx_hadamard_16x16 vpx_hadamard_16x16_neon
void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
-#define vpx_hadamard_32x32 vpx_hadamard_32x32_c
+void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+#define vpx_hadamard_32x32 vpx_hadamard_32x32_neon
void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
@@ -297,409 +298,544 @@ void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+void vpx_highbd_10_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_neon
void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+void vpx_highbd_10_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_neon
unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+unsigned int vpx_highbd_10_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_neon
unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+unsigned int vpx_highbd_10_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_neon
unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+unsigned int vpx_highbd_10_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_neon
unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+unsigned int vpx_highbd_10_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_neon
uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_neon
uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_neon
uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_neon
uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_neon
uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_neon
uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_neon
uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_neon
uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_neon
uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_neon
uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_neon
uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_neon
uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_neon
uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_neon
unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+unsigned int vpx_highbd_10_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_neon
unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+unsigned int vpx_highbd_10_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_neon
unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+unsigned int vpx_highbd_10_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_neon
unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+unsigned int vpx_highbd_10_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_neon
unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+unsigned int vpx_highbd_10_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_neon
unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+unsigned int vpx_highbd_10_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_neon
unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+unsigned int vpx_highbd_10_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_neon
unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+unsigned int vpx_highbd_10_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_neon
unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+unsigned int vpx_highbd_10_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_neon
unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+unsigned int vpx_highbd_10_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_neon
unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+unsigned int vpx_highbd_10_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_neon
unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+unsigned int vpx_highbd_10_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_neon
unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+unsigned int vpx_highbd_10_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_neon
void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+void vpx_highbd_12_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_neon
void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+void vpx_highbd_12_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_neon
unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+unsigned int vpx_highbd_12_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_neon
unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+unsigned int vpx_highbd_12_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_neon
unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+unsigned int vpx_highbd_12_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_neon
unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+unsigned int vpx_highbd_12_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_neon
uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_neon
uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_neon
uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_neon
uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_neon
uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_neon
uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_neon
uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_neon
uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_neon
uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_neon
uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_neon
uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_neon
uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_neon
uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_neon
unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+unsigned int vpx_highbd_12_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_neon
unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+unsigned int vpx_highbd_12_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_neon
unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+unsigned int vpx_highbd_12_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_neon
unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+unsigned int vpx_highbd_12_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_neon
unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+unsigned int vpx_highbd_12_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_neon
unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+unsigned int vpx_highbd_12_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_neon
unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+unsigned int vpx_highbd_12_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_neon
unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+unsigned int vpx_highbd_12_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_neon
unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+unsigned int vpx_highbd_12_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_neon
unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c
+unsigned int vpx_highbd_12_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_neon
unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c
+unsigned int vpx_highbd_12_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_neon
unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+unsigned int vpx_highbd_12_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_neon
unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c
+unsigned int vpx_highbd_12_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_neon
void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+void vpx_highbd_8_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_neon
void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+void vpx_highbd_8_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_neon
unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c
+unsigned int vpx_highbd_8_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_neon
unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+unsigned int vpx_highbd_8_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_neon
unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+unsigned int vpx_highbd_8_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_neon
unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c
+unsigned int vpx_highbd_8_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_neon
uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_neon
uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_neon
uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_neon
uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_neon
uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_neon
uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_neon
uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_neon
uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_neon
uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_neon
uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_neon
uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_neon
uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_neon
uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_neon
unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c
+unsigned int vpx_highbd_8_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_neon
unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c
+unsigned int vpx_highbd_8_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_neon
unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c
+unsigned int vpx_highbd_8_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_neon
unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c
+unsigned int vpx_highbd_8_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_neon
unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c
+unsigned int vpx_highbd_8_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_neon
unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c
+unsigned int vpx_highbd_8_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_neon
unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+unsigned int vpx_highbd_8_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_neon
unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+unsigned int vpx_highbd_8_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_neon
unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c
+unsigned int vpx_highbd_8_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_neon
unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c
+unsigned int vpx_highbd_8_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_neon
unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c
+unsigned int vpx_highbd_8_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_neon
unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+unsigned int vpx_highbd_8_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_neon
unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c
+unsigned int vpx_highbd_8_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_neon
unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p);
#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
@@ -708,7 +844,8 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p);
#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
-#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_neon
void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd);
void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd);
@@ -887,25 +1024,32 @@ void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const
#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_neon
void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_neon
void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_neon
void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_neon
void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_neon
void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_neon
void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_neon
void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_neon
void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride);
@@ -1046,133 +1190,175 @@ void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp
#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_neon
void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+void vpx_highbd_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_neon
unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+unsigned int vpx_highbd_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_neon
unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+unsigned int vpx_highbd_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_neon
void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+void vpx_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_neon
unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+unsigned int vpx_highbd_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_neon
unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+unsigned int vpx_highbd_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_neon
void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+void vpx_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_neon
unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+unsigned int vpx_highbd_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_neon
unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+unsigned int vpx_highbd_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_neon
void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+void vpx_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_neon
unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+unsigned int vpx_highbd_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_neon
unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+unsigned int vpx_highbd_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_neon
void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+void vpx_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_neon
unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+unsigned int vpx_highbd_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_neon
unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+unsigned int vpx_highbd_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_neon
void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+void vpx_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_neon
unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+unsigned int vpx_highbd_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_neon
unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+unsigned int vpx_highbd_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_neon
void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+void vpx_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_neon
unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+unsigned int vpx_highbd_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_neon
unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+unsigned int vpx_highbd_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_neon
void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+void vpx_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_neon
unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+unsigned int vpx_highbd_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_neon
unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+unsigned int vpx_highbd_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_neon
void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+void vpx_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_neon
unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+unsigned int vpx_highbd_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_neon
unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+unsigned int vpx_highbd_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_neon
void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+void vpx_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_neon
unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+unsigned int vpx_highbd_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_neon
unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+unsigned int vpx_highbd_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_neon
void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+void vpx_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_neon
unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+unsigned int vpx_highbd_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_neon
unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+unsigned int vpx_highbd_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_neon
void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+void vpx_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_neon
unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+unsigned int vpx_highbd_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_neon
unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+unsigned int vpx_highbd_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_neon
void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+void vpx_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_neon
unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+unsigned int vpx_highbd_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_neon
unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+unsigned int vpx_highbd_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_neon
void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+void vpx_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_neon
int vpx_highbd_satd_c(const tran_low_t *coeff, int length);
#define vpx_highbd_satd vpx_highbd_satd_c
void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd);
-#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_neon
void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd);
void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd);
diff --git a/config/arm-neon/vpx_version.h b/config/arm-neon/vpx_version.h
index a90ab60d9..fa2cc50fd 100644
--- a/config/arm-neon/vpx_version.h
+++ b/config/arm-neon/vpx_version.h
@@ -1,8 +1,8 @@
// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 12
+#define VERSION_MINOR 13
#define VERSION_PATCH 0
-#define VERSION_EXTRA ""
+#define VERSION_EXTRA "1559-gcd7dbca207"
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.12.0"
-#define VERSION_STRING " v1.12.0"
+#define VERSION_STRING_NOSP "v1.13.0-1559-gcd7dbca207"
+#define VERSION_STRING " v1.13.0-1559-gcd7dbca207"
diff --git a/config/arm64/vp9_rtcd.h b/config/arm64/vp9_rtcd.h
index 01065e667..b2b2fc2dc 100644
--- a/config/arm64/vp9_rtcd.h
+++ b/config/arm64/vp9_rtcd.h
@@ -38,7 +38,8 @@ int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_neon
void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -62,7 +63,8 @@ void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_neon
void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
@@ -83,10 +85,12 @@ void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_neon
void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_neon
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+void vp9_highbd_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_neon
void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count);
#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
diff --git a/config/arm64/vpx_dsp_rtcd.h b/config/arm64/vpx_dsp_rtcd.h
index 99abbb974..565105892 100644
--- a/config/arm64/vpx_dsp_rtcd.h
+++ b/config/arm64/vpx_dsp_rtcd.h
@@ -287,7 +287,8 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran
#define vpx_hadamard_16x16 vpx_hadamard_16x16_neon
void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
-#define vpx_hadamard_32x32 vpx_hadamard_32x32_c
+void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+#define vpx_hadamard_32x32 vpx_hadamard_32x32_neon
void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
@@ -297,409 +298,544 @@ void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+void vpx_highbd_10_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_neon
void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+void vpx_highbd_10_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_neon
unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+unsigned int vpx_highbd_10_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_neon
unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+unsigned int vpx_highbd_10_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_neon
unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+unsigned int vpx_highbd_10_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_neon
unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+unsigned int vpx_highbd_10_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_neon
uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_neon
uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_neon
uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_neon
uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_neon
uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_neon
uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_neon
uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_neon
uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_neon
uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_neon
uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_neon
uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_neon
uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_neon
uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_neon
unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+unsigned int vpx_highbd_10_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_neon
unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+unsigned int vpx_highbd_10_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_neon
unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+unsigned int vpx_highbd_10_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_neon
unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+unsigned int vpx_highbd_10_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_neon
unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+unsigned int vpx_highbd_10_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_neon
unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+unsigned int vpx_highbd_10_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_neon
unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+unsigned int vpx_highbd_10_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_neon
unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+unsigned int vpx_highbd_10_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_neon
unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+unsigned int vpx_highbd_10_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_neon
unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+unsigned int vpx_highbd_10_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_neon
unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+unsigned int vpx_highbd_10_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_neon
unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+unsigned int vpx_highbd_10_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_neon
unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+unsigned int vpx_highbd_10_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_neon
void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+void vpx_highbd_12_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_neon
void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+void vpx_highbd_12_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_neon
unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+unsigned int vpx_highbd_12_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_neon
unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+unsigned int vpx_highbd_12_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_neon
unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+unsigned int vpx_highbd_12_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_neon
unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+unsigned int vpx_highbd_12_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_neon
uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_neon
uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_neon
uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_neon
uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_neon
uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_neon
uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_neon
uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_neon
uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_neon
uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_neon
uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_neon
uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_neon
uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_neon
uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_neon
unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+unsigned int vpx_highbd_12_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_neon
unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+unsigned int vpx_highbd_12_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_neon
unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+unsigned int vpx_highbd_12_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_neon
unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+unsigned int vpx_highbd_12_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_neon
unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+unsigned int vpx_highbd_12_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_neon
unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+unsigned int vpx_highbd_12_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_neon
unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+unsigned int vpx_highbd_12_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_neon
unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+unsigned int vpx_highbd_12_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_neon
unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+unsigned int vpx_highbd_12_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_neon
unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c
+unsigned int vpx_highbd_12_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_neon
unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c
+unsigned int vpx_highbd_12_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_neon
unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+unsigned int vpx_highbd_12_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_neon
unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c
+unsigned int vpx_highbd_12_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_neon
void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+void vpx_highbd_8_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_neon
void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+void vpx_highbd_8_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_neon
unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c
+unsigned int vpx_highbd_8_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_neon
unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+unsigned int vpx_highbd_8_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_neon
unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+unsigned int vpx_highbd_8_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_neon
unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c
+unsigned int vpx_highbd_8_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_neon
uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_neon
uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_neon
uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_neon
uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_neon
uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_neon
uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_neon
uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_neon
uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_neon
uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_neon
uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_neon
uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_neon
uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_neon
uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_neon
unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c
+unsigned int vpx_highbd_8_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_neon
unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c
+unsigned int vpx_highbd_8_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_neon
unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c
+unsigned int vpx_highbd_8_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_neon
unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c
+unsigned int vpx_highbd_8_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_neon
unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c
+unsigned int vpx_highbd_8_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_neon
unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c
+unsigned int vpx_highbd_8_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_neon
unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+unsigned int vpx_highbd_8_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_neon
unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+unsigned int vpx_highbd_8_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_neon
unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c
+unsigned int vpx_highbd_8_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_neon
unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c
+unsigned int vpx_highbd_8_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_neon
unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c
+unsigned int vpx_highbd_8_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_neon
unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+unsigned int vpx_highbd_8_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_neon
unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c
+unsigned int vpx_highbd_8_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_neon
unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p);
#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
@@ -708,7 +844,8 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p);
#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
-#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_neon
void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd);
void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd);
@@ -887,25 +1024,32 @@ void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const
#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_neon
void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_neon
void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_neon
void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_neon
void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_neon
void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_neon
void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_neon
void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_neon
void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride);
@@ -1046,133 +1190,175 @@ void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp
#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_neon
void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+void vpx_highbd_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_neon
unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+unsigned int vpx_highbd_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_neon
unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+unsigned int vpx_highbd_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_neon
void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+void vpx_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_neon
unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+unsigned int vpx_highbd_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_neon
unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+unsigned int vpx_highbd_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_neon
void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+void vpx_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_neon
unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+unsigned int vpx_highbd_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_neon
unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+unsigned int vpx_highbd_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_neon
void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+void vpx_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_neon
unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+unsigned int vpx_highbd_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_neon
unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+unsigned int vpx_highbd_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_neon
void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+void vpx_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_neon
unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+unsigned int vpx_highbd_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_neon
unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+unsigned int vpx_highbd_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_neon
void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+void vpx_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_neon
unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+unsigned int vpx_highbd_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_neon
unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+unsigned int vpx_highbd_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_neon
void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+void vpx_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_neon
unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+unsigned int vpx_highbd_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_neon
unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+unsigned int vpx_highbd_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_neon
void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+void vpx_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_neon
unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+unsigned int vpx_highbd_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_neon
unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+unsigned int vpx_highbd_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_neon
void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+void vpx_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_neon
unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+unsigned int vpx_highbd_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_neon
unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+unsigned int vpx_highbd_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_neon
void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+void vpx_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_neon
unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+unsigned int vpx_highbd_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_neon
unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+unsigned int vpx_highbd_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_neon
void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+void vpx_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_neon
unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+unsigned int vpx_highbd_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_neon
unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+unsigned int vpx_highbd_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_neon
void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+void vpx_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_neon
unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+unsigned int vpx_highbd_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_neon
unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+unsigned int vpx_highbd_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_neon
void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+void vpx_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_neon
unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+unsigned int vpx_highbd_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_neon
unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+unsigned int vpx_highbd_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_neon
void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+void vpx_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_neon
int vpx_highbd_satd_c(const tran_low_t *coeff, int length);
#define vpx_highbd_satd vpx_highbd_satd_c
void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd);
-#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_neon
void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd);
void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd);
diff --git a/config/arm64/vpx_version.h b/config/arm64/vpx_version.h
index a90ab60d9..fa2cc50fd 100644
--- a/config/arm64/vpx_version.h
+++ b/config/arm64/vpx_version.h
@@ -1,8 +1,8 @@
// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 12
+#define VERSION_MINOR 13
#define VERSION_PATCH 0
-#define VERSION_EXTRA ""
+#define VERSION_EXTRA "1559-gcd7dbca207"
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.12.0"
-#define VERSION_STRING " v1.12.0"
+#define VERSION_STRING_NOSP "v1.13.0-1559-gcd7dbca207"
+#define VERSION_STRING " v1.13.0-1559-gcd7dbca207"
diff --git a/config/generic/vpx_version.h b/config/generic/vpx_version.h
index a90ab60d9..fa2cc50fd 100644
--- a/config/generic/vpx_version.h
+++ b/config/generic/vpx_version.h
@@ -1,8 +1,8 @@
// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 12
+#define VERSION_MINOR 13
#define VERSION_PATCH 0
-#define VERSION_EXTRA ""
+#define VERSION_EXTRA "1559-gcd7dbca207"
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.12.0"
-#define VERSION_STRING " v1.12.0"
+#define VERSION_STRING_NOSP "v1.13.0-1559-gcd7dbca207"
+#define VERSION_STRING " v1.13.0-1559-gcd7dbca207"
diff --git a/config/x86/vp9_rtcd.h b/config/x86/vp9_rtcd.h
index cff5e7f63..580d55a28 100644
--- a/config/x86/vp9_rtcd.h
+++ b/config/x86/vp9_rtcd.h
@@ -106,10 +106,12 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp vp9_quantize_fp_sse2
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_ssse3
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_ssse3
void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
diff --git a/config/x86/vpx_dsp_rtcd.h b/config/x86/vpx_dsp_rtcd.h
index 8b94dd89f..91242deee 100644
--- a/config/x86/vpx_dsp_rtcd.h
+++ b/config/x86/vpx_dsp_rtcd.h
@@ -833,7 +833,8 @@ unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p);
#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_sse2
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
-#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_sse2
void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd);
#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
diff --git a/config/x86/vpx_version.h b/config/x86/vpx_version.h
index a90ab60d9..fa2cc50fd 100644
--- a/config/x86/vpx_version.h
+++ b/config/x86/vpx_version.h
@@ -1,8 +1,8 @@
// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 12
+#define VERSION_MINOR 13
#define VERSION_PATCH 0
-#define VERSION_EXTRA ""
+#define VERSION_EXTRA "1559-gcd7dbca207"
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.12.0"
-#define VERSION_STRING " v1.12.0"
+#define VERSION_STRING_NOSP "v1.13.0-1559-gcd7dbca207"
+#define VERSION_STRING " v1.13.0-1559-gcd7dbca207"
diff --git a/config/x86_64/vpx_dsp_rtcd.h b/config/x86_64/vpx_dsp_rtcd.h
index 284453f06..22401f1c0 100644
--- a/config/x86_64/vpx_dsp_rtcd.h
+++ b/config/x86_64/vpx_dsp_rtcd.h
@@ -834,7 +834,8 @@ unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p);
#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_sse2
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
-#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_sse2
void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd);
void vpx_highbd_convolve8_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd);
diff --git a/config/x86_64/vpx_version.h b/config/x86_64/vpx_version.h
index a90ab60d9..fa2cc50fd 100644
--- a/config/x86_64/vpx_version.h
+++ b/config/x86_64/vpx_version.h
@@ -1,8 +1,8 @@
// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 12
+#define VERSION_MINOR 13
#define VERSION_PATCH 0
-#define VERSION_EXTRA ""
+#define VERSION_EXTRA "1559-gcd7dbca207"
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.12.0"
-#define VERSION_STRING " v1.12.0"
+#define VERSION_STRING_NOSP "v1.13.0-1559-gcd7dbca207"
+#define VERSION_STRING " v1.13.0-1559-gcd7dbca207"
diff --git a/configure b/configure
index beea65032..ae289f77b 100755
--- a/configure
+++ b/configure
@@ -101,9 +101,12 @@ all_platforms="${all_platforms} arm64-android-gcc"
all_platforms="${all_platforms} arm64-darwin-gcc"
all_platforms="${all_platforms} arm64-darwin20-gcc"
all_platforms="${all_platforms} arm64-darwin21-gcc"
+all_platforms="${all_platforms} arm64-darwin22-gcc"
all_platforms="${all_platforms} arm64-linux-gcc"
all_platforms="${all_platforms} arm64-win64-gcc"
all_platforms="${all_platforms} arm64-win64-vs15"
+all_platforms="${all_platforms} arm64-win64-vs16"
+all_platforms="${all_platforms} arm64-win64-vs17"
all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8
@@ -112,6 +115,8 @@ all_platforms="${all_platforms} armv7-none-rvct" #neon Cortex-A8
all_platforms="${all_platforms} armv7-win32-gcc"
all_platforms="${all_platforms} armv7-win32-vs14"
all_platforms="${all_platforms} armv7-win32-vs15"
+all_platforms="${all_platforms} armv7-win32-vs16"
+all_platforms="${all_platforms} armv7-win32-vs17"
all_platforms="${all_platforms} armv7s-darwin-gcc"
all_platforms="${all_platforms} armv8-linux-gcc"
all_platforms="${all_platforms} loongarch32-linux-gcc"
@@ -157,6 +162,7 @@ all_platforms="${all_platforms} x86_64-darwin18-gcc"
all_platforms="${all_platforms} x86_64-darwin19-gcc"
all_platforms="${all_platforms} x86_64-darwin20-gcc"
all_platforms="${all_platforms} x86_64-darwin21-gcc"
+all_platforms="${all_platforms} x86_64-darwin22-gcc"
all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
all_platforms="${all_platforms} x86_64-linux-gcc"
all_platforms="${all_platforms} x86_64-linux-icc"
@@ -666,11 +672,18 @@ process_toolchain() {
check_add_cxxflags -Wno-psabi
fi
+ # Enforce C++11 compatibility.
+ check_add_cxxflags -Wc++14-extensions
+ check_add_cxxflags -Wc++17-extensions
+ check_add_cxxflags -Wc++20-extensions
+
# disable some warnings specific to libyuv.
check_cxxflags -Wno-missing-declarations \
&& LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations"
check_cxxflags -Wno-missing-prototypes \
&& LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes"
+ check_cxxflags -Wno-pass-failed \
+ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-pass-failed"
check_cxxflags -Wno-unused-parameter \
&& LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter"
fi
diff --git a/examples/svc_encodeframe.c b/examples/svc_encodeframe.c
index 08bda0e5c..003096e70 100644
--- a/examples/svc_encodeframe.c
+++ b/examples/svc_encodeframe.c
@@ -552,11 +552,8 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
iter = NULL;
while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) {
switch (cx_pkt->kind) {
- case VPX_CODEC_PSNR_PKT: {
- }
- ++si->psnr_pkt_received;
- break;
- default: { break; }
+ case VPX_CODEC_PSNR_PKT: ++si->psnr_pkt_received; break;
+ default: break;
}
}
diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c
index e85dbf8e7..d287e5831 100644
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -1146,7 +1146,9 @@ int main(int argc, const char **argv) {
cx_pkt->data.twopass_stats.sz);
break;
}
- default: { break; }
+ default: {
+ break;
+ }
}
#if CONFIG_VP9_DECODER && !SIMULCAST_MODE
diff --git a/libs.doxy_template b/libs.doxy_template
index 1eacc8fe2..1ee442af3 100644
--- a/libs.doxy_template
+++ b/libs.doxy_template
@@ -654,12 +654,6 @@ VERBATIM_HEADERS = YES
ALPHABETICAL_INDEX = NO
-# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
-# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
-# in which this list will be split (can be a number in the range [1..20])
-
-COLS_IN_ALPHA_INDEX = 5
-
# In case all classes in a project start with a common prefix, all
# classes will be put under the same header in the alphabetical index.
# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
@@ -1099,32 +1093,10 @@ ALLEXTERNALS = NO
EXTERNAL_GROUPS = YES
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of `which perl').
-
-PERL_PATH = /usr/bin/perl
-
#---------------------------------------------------------------------------
# Configuration options related to the dot tool
#---------------------------------------------------------------------------
-# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
-# generate a inheritance diagram (in HTML, RTF and la_te_x) for classes with base
-# or super classes. Setting the tag to NO turns the diagrams off. Note that
-# this option is superseded by the HAVE_DOT option below. This is only a
-# fallback. It is recommended to install and use dot, since it yields more
-# powerful graphs.
-
-CLASS_DIAGRAMS = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see http://www.mcternan.me.uk/mscgen/) to
-# produce the chart and insert it in the documentation. The MSCGEN_PATH tag allows you to
-# specify the directory where the mscgen tool resides. If left empty the tool is assumed to
-# be found in the default search path.
-
-MSCGEN_PATH =
-
# If set to YES, the inheritance and collaboration graphs will hide
# inheritance and usage relations if the target is undocumented
# or is not a class.
@@ -1138,10 +1110,14 @@ HIDE_UNDOC_RELATIONS = YES
HAVE_DOT = NO
-# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for each documented class showing the direct and
-# indirect inheritance relations. Setting this tag to YES will force the
-# the CLASS_DIAGRAMS tag to NO.
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a
+# graph for each documented class showing the direct and indirect inheritance
+# relations. In case HAVE_DOT is set as well dot will be used to draw the graph,
+# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set
+# to TEXT the direct and indirect inheritance relations will be shown as texts /
+# links.
+# Possible values are: NO, YES, TEXT and GRAPH.
+# The default value is: YES.
CLASS_GRAPH = YES
diff --git a/libs.mk b/libs.mk
index 00e49a19d..1f7f03aa3 100644
--- a/libs.mk
+++ b/libs.mk
@@ -312,8 +312,8 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
# To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
# SO_VERSION_* then follow the rules in the link to detemine the new version
# (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
-SO_VERSION_MAJOR := 7
-SO_VERSION_MINOR := 1
+SO_VERSION_MAJOR := 8
+SO_VERSION_MINOR := 0
SO_VERSION_PATCH := 0
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib
@@ -446,13 +446,13 @@ ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes)
# YASM
$(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h
@echo " [CREATE] $@"
- @LC_ALL=C egrep "#define [A-Z0-9_]+ [01]" $< \
+ @LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \
| awk '{print $$2 " equ " $$3}' > $@
else
ADS2GAS=$(if $(filter yes,$(CONFIG_GCC)),| $(ASM_CONVERSION))
$(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h
@echo " [CREATE] $@"
- @LC_ALL=C egrep "#define [A-Z0-9_]+ [01]" $< \
+ @LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \
| awk '{print $$2 " EQU " $$3}' $(ADS2GAS) > $@
@echo " END" $(ADS2GAS) >> $@
CLEAN-OBJS += $(BUILD_PFX)vpx_config.asm
@@ -536,7 +536,7 @@ $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1
esac \
)
-testdata:: $(LIBVPX_TEST_DATA)
+testdata: $(LIBVPX_TEST_DATA)
$(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\
[ -x "$$(which shasum)" ] && sha1sum=shasum;\
[ -x "$$(which sha1)" ] && sha1sum=sha1;\
@@ -709,15 +709,15 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(TEST_INTRA_PRED_SPEED_SRCS)
INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(RC_INTERFACE_TEST_SRCS)
define test_shard_template
-test:: test_shard.$(1)
-test-no-data-check:: test_shard_ndc.$(1)
+test: test_shard.$(1)
+test-no-data-check: test_shard_ndc.$(1)
test_shard.$(1) test_shard_ndc.$(1): $(LIBVPX_TEST_BIN)
@set -e; \
export GTEST_SHARD_INDEX=$(1); \
export GTEST_TOTAL_SHARDS=$(2); \
$(LIBVPX_TEST_BIN)
test_shard.$(1): testdata
-.PHONY: test_shard.$(1)
+.PHONY: test_shard.$(1) test_shard_ndc.$(1)
endef
NUM_SHARDS := 10
diff --git a/md5_utils.c b/md5_utils.c
index c4106525f..abd8d43c3 100644
--- a/md5_utils.c
+++ b/md5_utils.c
@@ -151,8 +151,8 @@ void MD5Final(md5byte digest[16], struct MD5Context *ctx) {
* reflect the addition of 16 longwords of new data. MD5Update blocks
* the data and converts bytes into longwords for this routine.
*/
-VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4],
- UWORD32 const in[16]) {
+VPX_NO_UNSIGNED_OVERFLOW_CHECK VPX_NO_UNSIGNED_SHIFT_CHECK void MD5Transform(
+ UWORD32 buf[4], UWORD32 const in[16]) {
UWORD32 a, b, c, d;
a = buf[0];
diff --git a/test/acm_random.h b/test/acm_random.h
index 3458340a1..c7122b933 100644
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -28,43 +28,43 @@ class ACMRandom {
explicit ACMRandom(int seed) : random_(seed) {}
void Reset(int seed) { random_.Reseed(seed); }
- uint16_t Rand16(void) {
+ uint16_t Rand16() {
const uint32_t value =
random_.Generate(testing::internal::Random::kMaxRange);
return (value >> 15) & 0xffff;
}
- int32_t Rand20Signed(void) {
+ int32_t Rand20Signed() {
// Use 20 bits: values between 524287 and -524288.
const uint32_t value = random_.Generate(1048576);
return static_cast<int32_t>(value) - 524288;
}
- int16_t Rand16Signed(void) {
+ int16_t Rand16Signed() {
// Use 16 bits: values between 32767 and -32768.
return static_cast<int16_t>(random_.Generate(65536));
}
- int16_t Rand13Signed(void) {
+ int16_t Rand13Signed() {
// Use 13 bits: values between 4095 and -4096.
const uint32_t value = random_.Generate(8192);
return static_cast<int16_t>(value) - 4096;
}
- int16_t Rand9Signed(void) {
+ int16_t Rand9Signed() {
// Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
const uint32_t value = random_.Generate(512);
return static_cast<int16_t>(value) - 256;
}
- uint8_t Rand8(void) {
+ uint8_t Rand8() {
const uint32_t value =
random_.Generate(testing::internal::Random::kMaxRange);
// There's a bit more entropy in the upper bits of this implementation.
return (value >> 23) & 0xff;
}
- uint8_t Rand8Extremes(void) {
+ uint8_t Rand8Extremes() {
// Returns a random value near 0 or near 255, to better exercise
// saturation behavior.
const uint8_t r = Rand8();
@@ -82,7 +82,7 @@ class ACMRandom {
int operator()(int n) { return PseudoUniform(n); }
- static int DeterministicSeed(void) { return 0xbaba; }
+ static int DeterministicSeed() { return 0xbaba; }
private:
testing::internal::Random random_;
diff --git a/test/android/Android.mk b/test/android/Android.mk
index 87155fcb5..9a7533ebb 100644
--- a/test/android/Android.mk
+++ b/test/android/Android.mk
@@ -10,6 +10,9 @@
# The test app itself runs on the command line through adb shell
# The paths are really messed up as the libvpx make file
# expects to be made from a parent directory.
+
+# Ignore this file during non-NDK builds.
+ifdef NDK_ROOT
CUR_WD := $(call my-dir)
BINDINGS_DIR := $(CUR_WD)/../../..
LOCAL_PATH := $(CUR_WD)/../../..
@@ -61,3 +64,4 @@ LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC))
# some test files depend on *_rtcd.h, ensure they're generated first.
$(eval $(call rtcd_dep_template))
include $(BUILD_EXECUTABLE)
+endif # NDK_ROOT
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index 3977a2d0b..70aeab8d7 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -22,13 +22,14 @@ namespace {
using ::libvpx_test::ACMRandom;
using ::libvpx_test::Buffer;
-typedef void (*AvgPredFunc)(uint8_t *a, const uint8_t *b, int w, int h,
- const uint8_t *c, int c_stride);
-
-uint8_t avg_with_rounding(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; }
+template <typename Pixel>
+Pixel avg_with_rounding(Pixel a, Pixel b) {
+ return (a + b + 1) >> 1;
+}
-void reference_pred(const Buffer<uint8_t> &pred, const Buffer<uint8_t> &ref,
- int width, int height, Buffer<uint8_t> *avg) {
+template <typename Pixel>
+void reference_pred(const Buffer<Pixel> &pred, const Buffer<Pixel> &ref,
+ int width, int height, Buffer<Pixel> *avg) {
ASSERT_NE(avg->TopLeftPixel(), nullptr);
ASSERT_NE(pred.TopLeftPixel(), nullptr);
ASSERT_NE(ref.TopLeftPixel(), nullptr);
@@ -36,12 +37,16 @@ void reference_pred(const Buffer<uint8_t> &pred, const Buffer<uint8_t> &ref,
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
avg->TopLeftPixel()[y * avg->stride() + x] =
- avg_with_rounding(pred.TopLeftPixel()[y * pred.stride() + x],
- ref.TopLeftPixel()[y * ref.stride() + x]);
+ avg_with_rounding<Pixel>(pred.TopLeftPixel()[y * pred.stride() + x],
+ ref.TopLeftPixel()[y * ref.stride() + x]);
}
}
}
+using AvgPredFunc = void (*)(uint8_t *a, const uint8_t *b, int w, int h,
+ const uint8_t *c, int c_stride);
+
+template <int bitdepth, typename Pixel>
class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> {
public:
virtual void SetUp() {
@@ -49,15 +54,19 @@ class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> {
rnd_.Reset(ACMRandom::DeterministicSeed());
}
+ void TestSizeCombinations();
+ void TestCompareReferenceRandom();
+ void TestSpeed();
+
protected:
AvgPredFunc avg_pred_func_;
ACMRandom rnd_;
};
-TEST_P(AvgPredTest, SizeCombinations) {
+template <int bitdepth, typename Pixel>
+void AvgPredTest<bitdepth, Pixel>::TestSizeCombinations() {
// This is called as part of the sub pixel variance. As such it must be one of
// the variance block sizes.
-
for (int width_pow = 2; width_pow <= 6; ++width_pow) {
for (int height_pow = width_pow - 1; height_pow <= width_pow + 1;
++height_pow) {
@@ -70,23 +79,30 @@ TEST_P(AvgPredTest, SizeCombinations) {
const int width = 1 << width_pow;
const int height = 1 << height_pow;
// Only the reference buffer may have a stride not equal to width.
- Buffer<uint8_t> ref =
- Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+ Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
ASSERT_TRUE(ref.Init());
- Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(pred.Init());
- Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(avg_ref.Init());
- Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(avg_chk.Init());
+ const int bitdepth_mask = (1 << bitdepth) - 1;
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+ }
+ }
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+ }
+ }
- ref.Set(&rnd_, &ACMRandom::Rand8);
- pred.Set(&rnd_, &ACMRandom::Rand8);
-
- reference_pred(pred, ref, width, height, &avg_ref);
- ASM_REGISTER_STATE_CHECK(
- avg_pred_func_(avg_chk.TopLeftPixel(), pred.TopLeftPixel(), width,
- height, ref.TopLeftPixel(), ref.stride()));
+ reference_pred<Pixel>(pred, ref, width, height, &avg_ref);
+ ASM_REGISTER_STATE_CHECK(avg_pred_func_(
+ (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(),
+ width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride()));
EXPECT_TRUE(avg_chk.CheckValues(avg_ref));
if (HasFailure()) {
@@ -99,26 +115,36 @@ TEST_P(AvgPredTest, SizeCombinations) {
}
}
-TEST_P(AvgPredTest, CompareReferenceRandom) {
+template <int bitdepth, typename Pixel>
+void AvgPredTest<bitdepth, Pixel>::TestCompareReferenceRandom() {
const int width = 64;
const int height = 32;
- Buffer<uint8_t> ref = Buffer<uint8_t>(width, height, 8);
+ Buffer<Pixel> ref = Buffer<Pixel>(width, height, 8);
ASSERT_TRUE(ref.Init());
- Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(pred.Init());
- Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(avg_ref.Init());
- Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(avg_chk.Init());
for (int i = 0; i < 500; ++i) {
- ref.Set(&rnd_, &ACMRandom::Rand8);
- pred.Set(&rnd_, &ACMRandom::Rand8);
+ const int bitdepth_mask = (1 << bitdepth) - 1;
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+ }
+ }
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+ }
+ }
- reference_pred(pred, ref, width, height, &avg_ref);
- ASM_REGISTER_STATE_CHECK(avg_pred_func_(avg_chk.TopLeftPixel(),
- pred.TopLeftPixel(), width, height,
- ref.TopLeftPixel(), ref.stride()));
+ reference_pred<Pixel>(pred, ref, width, height, &avg_ref);
+ ASM_REGISTER_STATE_CHECK(avg_pred_func_(
+ (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(),
+ width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride()));
EXPECT_TRUE(avg_chk.CheckValues(avg_ref));
if (HasFailure()) {
printf("Width: %d Height: %d\n", width, height);
@@ -128,7 +154,8 @@ TEST_P(AvgPredTest, CompareReferenceRandom) {
}
}
-TEST_P(AvgPredTest, DISABLED_Speed) {
+template <int bitdepth, typename Pixel>
+void AvgPredTest<bitdepth, Pixel>::TestSpeed() {
for (int width_pow = 2; width_pow <= 6; ++width_pow) {
for (int height_pow = width_pow - 1; height_pow <= width_pow + 1;
++height_pow) {
@@ -138,22 +165,30 @@ TEST_P(AvgPredTest, DISABLED_Speed) {
for (int ref_padding = 0; ref_padding < 2; ref_padding++) {
const int width = 1 << width_pow;
const int height = 1 << height_pow;
- Buffer<uint8_t> ref =
- Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+ Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
ASSERT_TRUE(ref.Init());
- Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(pred.Init());
- Buffer<uint8_t> avg = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(avg.Init());
-
- ref.Set(&rnd_, &ACMRandom::Rand8);
- pred.Set(&rnd_, &ACMRandom::Rand8);
+ const int bitdepth_mask = (1 << bitdepth) - 1;
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+ }
+ }
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+ }
+ }
vpx_usec_timer timer;
vpx_usec_timer_start(&timer);
- for (int i = 0; i < 10000000 / (width * height); ++i) {
- avg_pred_func_(avg.TopLeftPixel(), pred.TopLeftPixel(), width, height,
- ref.TopLeftPixel(), ref.stride());
+ for (int i = 0; i < 100000000 / (width * height); ++i) {
+ avg_pred_func_((uint8_t *)avg.TopLeftPixel(),
+ (uint8_t *)pred.TopLeftPixel(), width, height,
+ (uint8_t *)ref.TopLeftPixel(), ref.stride());
}
vpx_usec_timer_mark(&timer);
@@ -166,26 +201,64 @@ TEST_P(AvgPredTest, DISABLED_Speed) {
}
}
-INSTANTIATE_TEST_SUITE_P(C, AvgPredTest,
+using AvgPredTestLBD = AvgPredTest<8, uint8_t>;
+
+TEST_P(AvgPredTestLBD, SizeCombinations) { TestSizeCombinations(); }
+
+TEST_P(AvgPredTestLBD, CompareReferenceRandom) { TestCompareReferenceRandom(); }
+
+TEST_P(AvgPredTestLBD, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AvgPredTestLBD,
::testing::Values(&vpx_comp_avg_pred_c));
#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTest,
+INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTestLBD,
::testing::Values(&vpx_comp_avg_pred_sse2));
#endif // HAVE_SSE2
#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTest,
+INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD,
::testing::Values(&vpx_comp_avg_pred_neon));
#endif // HAVE_NEON
#if HAVE_VSX
-INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTest,
+INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTestLBD,
::testing::Values(&vpx_comp_avg_pred_vsx));
#endif // HAVE_VSX
#if HAVE_LSX
-INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTest,
+INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTestLBD,
::testing::Values(&vpx_comp_avg_pred_lsx));
#endif // HAVE_LSX
+
+#if CONFIG_VP9_HIGHBITDEPTH
+using HighbdAvgPredFunc = void (*)(uint16_t *a, const uint16_t *b, int w, int h,
+ const uint16_t *c, int c_stride);
+
+template <HighbdAvgPredFunc fn>
+void highbd_wrapper(uint8_t *a, const uint8_t *b, int w, int h,
+ const uint8_t *c, int c_stride) {
+ fn((uint16_t *)a, (const uint16_t *)b, w, h, (const uint16_t *)c, c_stride);
+}
+
+using AvgPredTestHBD = AvgPredTest<12, uint16_t>;
+
+TEST_P(AvgPredTestHBD, SizeCombinations) { TestSizeCombinations(); }
+
+TEST_P(AvgPredTestHBD, CompareReferenceRandom) { TestCompareReferenceRandom(); }
+
+TEST_P(AvgPredTestHBD, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(
+ C, AvgPredTestHBD,
+ ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_c>));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AvgPredTestHBD,
+ ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>));
+#endif // HAVE_SSE2
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 06837d809..d4ef7ae13 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -789,13 +789,23 @@ INSTANTIATE_TEST_SUITE_P(
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
-#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_SUITE_P(
NEON, Trans16x16DCT,
::testing::Values(make_tuple(&vpx_fdct16x16_neon,
&vpx_idct16x16_256_add_neon, 0, VPX_BITS_8)));
#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(
+ NEON, Trans16x16DCT,
+ ::testing::Values(
+ make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_10, 0, VPX_BITS_10),
+ make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_12, 0, VPX_BITS_12),
+ make_tuple(&vpx_fdct16x16_neon, &vpx_idct16x16_256_add_c, 0,
+ VPX_BITS_8)));
+#endif // HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_SUITE_P(
SSE2, Trans16x16DCT,
diff --git a/test/dct_partial_test.cc b/test/dct_partial_test.cc
index 8d0e3a912..e57fa0f48 100644
--- a/test/dct_partial_test.cc
+++ b/test/dct_partial_test.cc
@@ -145,11 +145,17 @@ INSTANTIATE_TEST_SUITE_P(
#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_SUITE_P(
NEON, PartialFdctTest,
- ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8),
- make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8),
+ ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_12),
+ make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_10),
+ make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_8),
+ make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_12),
+ make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_10),
+ make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_8),
make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12),
make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10),
make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8),
+ make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_12),
+ make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_10),
make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8)));
#else
INSTANTIATE_TEST_SUITE_P(
diff --git a/test/dct_test.cc b/test/dct_test.cc
index 2182f87e5..0304029bd 100644
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -539,6 +539,18 @@ INSTANTIATE_TEST_SUITE_P(AVX2, TransDCT,
#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
#if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo dct_neon_func_info[] = {
+ { &fdct_wrapper<vpx_highbd_fdct4x4_neon>,
+ &highbd_idct_wrapper<vpx_highbd_idct4x4_16_add_neon>, 4, 2 },
+ { &fdct_wrapper<vpx_highbd_fdct8x8_neon>,
+ &highbd_idct_wrapper<vpx_highbd_idct8x8_64_add_neon>, 8, 2 },
+ { &fdct_wrapper<vpx_highbd_fdct16x16_neon>,
+ &highbd_idct_wrapper<vpx_highbd_idct16x16_256_add_neon>, 16, 2 },
+ /* { &fdct_wrapper<vpx_highbd_fdct32x32_neon>,
+ &highbd_idct_wrapper<vpx_highbd_idct32x32_1024_add_neon>, 32, 2 },*/
+};
+#else
static const FuncInfo dct_neon_func_info[4] = {
{ &fdct_wrapper<vpx_fdct4x4_neon>, &idct_wrapper<vpx_idct4x4_16_add_neon>, 4,
1 },
@@ -549,12 +561,15 @@ static const FuncInfo dct_neon_func_info[4] = {
{ &fdct_wrapper<vpx_fdct32x32_neon>,
&idct_wrapper<vpx_idct32x32_1024_add_neon>, 32, 1 }
};
+#endif // CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_SUITE_P(
NEON, TransDCT,
- ::testing::Combine(::testing::Range(0, 4),
- ::testing::Values(dct_neon_func_info),
- ::testing::Values(0), ::testing::Values(VPX_BITS_8)));
+ ::testing::Combine(
+ ::testing::Range(0, static_cast<int>(sizeof(dct_neon_func_info) /
+ sizeof(dct_neon_func_info[0]))),
+ ::testing::Values(dct_neon_func_info), ::testing::Values(0),
+ ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12)));
#endif // HAVE_NEON
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
@@ -652,6 +667,8 @@ static const FuncInfo ht_neon_func_info[] = {
#if CONFIG_VP9_HIGHBITDEPTH
{ &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4,
2 },
+ { &vp9_highbd_fht4x4_neon, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>,
+ 4, 2 },
{ &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>, 8,
2 },
{ &vp9_highbd_fht16x16_c,
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index 6f61c7750..ecdf92834 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -233,8 +233,8 @@ TEST(EncodeAPI, SetRoi) {
roi.roi_map = roi_map;
// VP8 only. This value isn't range checked.
roi.static_threshold[1] = 1000;
- roi.static_threshold[2] = INT_MIN;
- roi.static_threshold[3] = INT_MAX;
+ roi.static_threshold[2] = UINT_MAX / 2 + 1;
+ roi.static_threshold[3] = UINT_MAX;
for (const auto delta : { -63, -1, 0, 1, 63 }) {
for (int i = 0; i < 8; ++i) {
@@ -336,7 +336,7 @@ TEST(EncodeAPI, ConfigChangeThreadCount) {
for (const auto *iface : kCodecIfaces) {
SCOPED_TRACE(vpx_codec_iface_name(iface));
for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) {
- vpx_codec_enc_cfg_t cfg;
+ vpx_codec_enc_cfg_t cfg = {};
struct Encoder {
~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); }
vpx_codec_ctx_t ctx = {};
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 1ce39eaef..d3feeee34 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -52,7 +52,8 @@ void Encoder::InitEncoder(VideoSource *video) {
}
}
-void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) {
+void Encoder::EncodeFrame(VideoSource *video,
+ const vpx_enc_frame_flags_t frame_flags) {
if (video->img()) {
EncodeFrameInternal(*video, frame_flags);
} else {
@@ -70,7 +71,7 @@ void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) {
}
void Encoder::EncodeFrameInternal(const VideoSource &video,
- const unsigned long frame_flags) {
+ const vpx_enc_frame_flags_t frame_flags) {
vpx_codec_err_t res;
const vpx_image_t *img = video.img();
@@ -169,7 +170,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
ASSERT_TRUE(passes_ == 1 || passes_ == 2);
for (unsigned int pass = 0; pass < passes_; pass++) {
- last_pts_ = 0;
+ vpx_codec_pts_t last_pts = 0;
if (passes_ == 1) {
cfg_.g_pass = VPX_RC_ONE_PASS;
@@ -225,8 +226,8 @@ void EncoderTest::RunLoop(VideoSource *video) {
has_dxdata = true;
}
- ASSERT_GE(pkt->data.frame.pts, last_pts_);
- last_pts_ = pkt->data.frame.pts;
+ ASSERT_GE(pkt->data.frame.pts, last_pts);
+ last_pts = pkt->data.frame.pts;
FramePktHook(pkt);
break;
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 7085945f6..b57df8529 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -103,7 +103,7 @@ class Encoder {
}
// This is a thin wrapper around vpx_codec_encode(), so refer to
// vpx_encoder.h for its semantics.
- void EncodeFrame(VideoSource *video, const unsigned long frame_flags);
+ void EncodeFrame(VideoSource *video, vpx_enc_frame_flags_t frame_flags);
// Convenience wrapper for EncodeFrame()
void EncodeFrame(VideoSource *video) { EncodeFrame(video, 0); }
@@ -184,7 +184,7 @@ class Encoder {
// Encode an image
void EncodeFrameInternal(const VideoSource &video,
- const unsigned long frame_flags);
+ vpx_enc_frame_flags_t frame_flags);
// Flush the encoder on EOS
void Flush();
@@ -206,8 +206,7 @@ class Encoder {
class EncoderTest {
protected:
explicit EncoderTest(const CodecFactory *codec)
- : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0),
- last_pts_(0) {
+ : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0) {
// Default to 1 thread.
cfg_.g_threads = 1;
}
@@ -290,8 +289,7 @@ class EncoderTest {
unsigned long deadline_;
TwopassStatsStore stats_;
unsigned long init_flags_;
- unsigned long frame_flags_;
- vpx_codec_pts_t last_pts_;
+ vpx_enc_frame_flags_t frame_flags_;
};
} // namespace libvpx_test
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 45a327ec2..45138f14b 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -496,7 +496,7 @@ class ErrorResilienceTestLargeCodecControls
++tot_frame_number_;
}
- virtual void EndPassHook(void) {
+ virtual void EndPassHook() {
duration_ = (last_pts_ + 1) * timebase_;
if (cfg_.ts_number_layers > 1) {
for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc
index d85c193e0..8a0eb71ba 100644
--- a/test/frame_size_tests.cc
+++ b/test/frame_size_tests.cc
@@ -111,7 +111,7 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest,
ASSERT_TRUE(passes_ == 1 || passes_ == 2);
for (unsigned int pass = 0; pass < passes_; pass++) {
- last_pts_ = 0;
+ vpx_codec_pts_t last_pts = 0;
if (passes_ == 1) {
cfg_.g_pass = VPX_RC_ONE_PASS;
@@ -144,8 +144,8 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest,
again = true;
switch (pkt->kind) {
case VPX_CODEC_CX_FRAME_PKT:
- ASSERT_GE(pkt->data.frame.pts, last_pts_);
- last_pts_ = pkt->data.frame.pts;
+ ASSERT_GE(pkt->data.frame.pts, last_pts);
+ last_pts = pkt->data.frame.pts;
FramePktHook(pkt);
break;
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index 10b1e79c1..f904e814a 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -264,7 +264,8 @@ INSTANTIATE_TEST_SUITE_P(
INSTANTIATE_TEST_SUITE_P(
NEON, HadamardLowbdTest,
::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_neon, 8),
- HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16)));
+ HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16),
+ HadamardFuncWithSize(&vpx_hadamard_32x32_neon, 32)));
#endif // HAVE_NEON
// TODO(jingning): Remove highbitdepth flag when the SIMD functions are
diff --git a/test/md5_helper.h b/test/md5_helper.h
index dc28dc628..9095d96a8 100644
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -47,7 +47,7 @@ class MD5 {
MD5Update(&md5_, data, static_cast<uint32_t>(size));
}
- const char *Get(void) {
+ const char *Get() {
static const char hex[16] = {
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
index 775f7f36a..27d5ffa90 100644
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -7,7 +7,11 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+
#include <limits.h>
+
+#include <memory>
+
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
@@ -458,14 +462,13 @@ TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) {
SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride());
- unsigned char *expected_output = new unsigned char[rows_ * cols_];
+ std::unique_ptr<unsigned char[]> expected_output(
+ new unsigned char[rows_ * cols_]);
ASSERT_NE(expected_output, nullptr);
- SetRows(expected_output, rows_, cols_, cols_);
+ SetRows(expected_output.get(), rows_, cols_, cols_);
RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), q2mbl(0),
- expected_output);
-
- delete[] expected_output;
+ expected_output.get());
}
TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) {
diff --git a/test/resize_test.cc b/test/resize_test.cc
index c57170ff9..715bb9d70 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -95,10 +95,11 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
unsigned int initial_h, unsigned int *w,
unsigned int *h, bool flag_codec,
bool smaller_width_larger_size_) {
+ *w = initial_w;
+ *h = initial_h;
+
if (smaller_width_larger_size_) {
if (frame < 30) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 100) {
@@ -109,8 +110,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 10) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 20) {
@@ -124,8 +123,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 40) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 50) {
@@ -139,8 +136,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 70) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 80) {
@@ -159,8 +154,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 110) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 120) {
@@ -179,8 +172,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 150) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 160) {
@@ -199,8 +190,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 190) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 200) {
@@ -219,8 +208,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 230) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 240) {
@@ -234,8 +221,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 260) {
- *w = initial_w;
- *h = initial_h;
return;
}
// Go down very low.
@@ -248,13 +233,9 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
// Cases that only works for VP9.
// For VP9: Swap width and height of original.
if (frame < 320) {
- *w = initial_h;
- *h = initial_w;
return;
}
}
- *w = initial_w;
- *h = initial_h;
}
class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
@@ -578,6 +559,8 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
}
}
+// TODO(https://crbug.com/webm/1642): This causes a segfault in
+// init_encode_frame_mb_context().
TEST_P(ResizeRealtimeTest, DISABLED_TestExternalResizeSmallerWidthBiggerSize) {
ResizingVideoSource video;
video.flag_codec_ = true;
@@ -794,8 +777,7 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) {
}
VP8_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES);
-VP9_INSTANTIATE_TEST_SUITE(ResizeTest,
- ::testing::Values(::libvpx_test::kRealTime));
+VP9_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES);
VP9_INSTANTIATE_TEST_SUITE(ResizeInternalTest,
::testing::Values(::libvpx_test::kOnePassBest));
VP9_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest,
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 2506f1adb..0896c77f1 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -311,13 +311,13 @@ class SADTest : public AbstractBench, public SADTestBase<SadMxNParam> {
ASSERT_EQ(reference_sad, exp_sad);
}
- void Run() {
+ void Run() override {
params_.func(source_data_, source_stride_, reference_data_,
reference_stride_);
}
};
-class SADavgTest : public SADTestBase<SadMxNAvgParam> {
+class SADavgTest : public AbstractBench, public SADTestBase<SadMxNAvgParam> {
public:
SADavgTest() : SADTestBase(GetParam()) {}
@@ -338,6 +338,11 @@ class SADavgTest : public SADTestBase<SadMxNAvgParam> {
ASSERT_EQ(reference_sad, exp_sad);
}
+
+ void Run() override {
+ params_.func(source_data_, source_stride_, reference_data_,
+ reference_stride_, second_pred_);
+ }
};
TEST_P(SADTest, MaxRef) {
@@ -437,6 +442,19 @@ TEST_P(SADavgTest, ShortSrc) {
source_stride_ = tmp_stride;
}
+TEST_P(SADavgTest, DISABLED_Speed) {
+ const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height);
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ FillRandom(second_pred_, params_.width);
+
+ RunNTimes(kCountSpeedTestBlock);
+
+ char title[16];
+ snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height);
+ PrintMedian(title);
+}
+
TEST_P(SADx4Test, MaxRef) {
FillConstant(source_data_, source_stride_, 0);
FillConstant(GetReference(0), reference_stride_, mask_);
@@ -517,14 +535,12 @@ TEST_P(SADx4Test, DISABLED_Speed) {
uint32_t reference_sad[4];
DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]);
vpx_usec_timer timer;
-
- memset(reference_sad, 0, sizeof(reference_sad));
- SADs(exp_sad);
+ for (int block = 0; block < 4; ++block) {
+ reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block));
+ }
vpx_usec_timer_start(&timer);
for (int i = 0; i < kCountSpeedTestBlock; ++i) {
- for (int block = 0; block < 4; ++block) {
- reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block));
- }
+ SADs(exp_sad);
}
vpx_usec_timer_mark(&timer);
for (int block = 0; block < 4; ++block) {
@@ -729,6 +745,45 @@ const SadMxNParam neon_tests[] = {
SadMxNParam(8, 4, &vpx_sad8x4_neon),
SadMxNParam(4, 8, &vpx_sad4x8_neon),
SadMxNParam(4, 4, &vpx_sad4x4_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 8),
+ SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 8),
+ SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 8),
+ SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 8),
+ SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 8),
+ SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 8),
+ SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 8),
+ SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 8),
+ SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 8),
+ SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 8),
+ SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 8),
+ SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 8),
+ SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 10),
+ SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 10),
+ SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 10),
+ SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 10),
+ SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 10),
+ SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 10),
+ SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 10),
+ SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 10),
+ SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 10),
+ SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 10),
+ SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 10),
+ SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 10),
+ SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 12),
+ SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 12),
+ SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 12),
+ SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 12),
+ SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 12),
+ SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 12),
+ SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 12),
+ SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 12),
+ SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 12),
+ SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 12),
+ SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 12),
+ SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
};
INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
@@ -746,6 +801,47 @@ const SadMxNAvgParam avg_neon_tests[] = {
SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_neon),
SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon),
SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 8),
+ SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 8),
+ SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 8),
+ SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 8),
+ SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 8),
+ SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 8),
+ SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 8),
+ SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 8),
+ SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 8),
+ SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 8),
+ SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 8),
+ SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 8),
+ SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 8),
+ SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 10),
+ SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 10),
+ SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 10),
+ SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 10),
+ SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 10),
+ SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 10),
+ SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 10),
+ SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 10),
+ SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 10),
+ SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 10),
+ SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 10),
+ SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 10),
+ SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 10),
+ SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 12),
+ SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 12),
+ SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 12),
+ SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 12),
+ SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 12),
+ SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 12),
+ SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 12),
+ SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 12),
+ SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 12),
+ SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 12),
+ SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 12),
+ SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 12),
+ SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
};
INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests));
@@ -763,6 +859,44 @@ const SadMxNx4Param x4d_neon_tests[] = {
SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon),
SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon),
SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 8),
+ SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 8),
+ SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 8),
+ SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 8),
+ SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 8),
+ SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 8),
+ SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 8),
+ SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 8),
+ SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 8),
+ SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 8),
+ SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 8),
+ SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 8),
+ SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 10),
+ SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 10),
+ SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 10),
+ SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 10),
+ SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 10),
+ SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 10),
+ SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 10),
+ SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 10),
+ SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 10),
+ SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 10),
+ SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 10),
+ SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 10),
+ SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 12),
+ SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 12),
+ SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 12),
+ SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 12),
+ SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 12),
+ SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 12),
+ SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 12),
+ SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 12),
+ SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 12),
+ SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 12),
+ SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 12),
+ SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
};
INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
#endif // HAVE_NEON
@@ -948,6 +1082,34 @@ const SadMxNParam avx2_tests[] = {
SadMxNParam(32, 64, &vpx_sad32x64_avx2),
SadMxNParam(32, 32, &vpx_sad32x32_avx2),
SadMxNParam(32, 16, &vpx_sad32x16_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 8),
+ SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 8),
+ SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 8),
+ SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 8),
+ SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 8),
+ SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 8),
+ SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 8),
+ SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 8),
+
+ SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 10),
+ SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 10),
+ SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 10),
+ SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 10),
+ SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 10),
+ SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 10),
+ SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 10),
+ SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 10),
+
+ SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 12),
+ SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 12),
+ SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 12),
+ SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 12),
+ SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 12),
+ SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 12),
+ SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 12),
+ SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
};
INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
@@ -957,12 +1119,64 @@ const SadMxNAvgParam avg_avx2_tests[] = {
SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_avx2),
SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_avx2),
SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 8),
+ SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 8),
+ SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 8),
+ SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 8),
+ SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 8),
+ SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 8),
+ SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 8),
+ SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 8),
+ SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 10),
+ SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 10),
+ SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 10),
+ SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 10),
+ SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 10),
+ SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 10),
+ SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 10),
+ SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 10),
+ SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 12),
+ SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 12),
+ SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 12),
+ SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 12),
+ SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 12),
+ SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 12),
+ SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 12),
+ SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
};
INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
const SadMxNx4Param x4d_avx2_tests[] = {
SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2),
SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 8),
+ SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 8),
+ SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 8),
+ SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 8),
+ SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 8),
+ SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 8),
+ SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 8),
+ SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 8),
+ SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 10),
+ SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 10),
+ SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 10),
+ SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 10),
+ SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 10),
+ SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 10),
+ SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 10),
+ SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 10),
+ SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 12),
+ SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 12),
+ SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 12),
+ SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 12),
+ SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 12),
+ SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 12),
+ SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 12),
+ SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
};
INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc
index 291cb0128..484252ca4 100644
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc
@@ -548,13 +548,16 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
}
if (!single_layer_resize_) {
- ASSERT_EQ(pkt->data.frame.width[sl],
- top_sl_width_ * svc_params_.scaling_factor_num[sl] /
- svc_params_.scaling_factor_den[sl]);
-
- ASSERT_EQ(pkt->data.frame.height[sl],
- top_sl_height_ * svc_params_.scaling_factor_num[sl] /
- svc_params_.scaling_factor_den[sl]);
+ unsigned int scaled_width = top_sl_width_ *
+ svc_params_.scaling_factor_num[sl] /
+ svc_params_.scaling_factor_den[sl];
+ if (scaled_width % 2 != 0) scaled_width += 1;
+ ASSERT_EQ(pkt->data.frame.width[sl], scaled_width);
+ unsigned int scaled_height = top_sl_height_ *
+ svc_params_.scaling_factor_num[sl] /
+ svc_params_.scaling_factor_den[sl];
+ if (scaled_height % 2 != 0) scaled_height += 1;
+ ASSERT_EQ(pkt->data.frame.height[sl], scaled_height);
} else if (superframe_count_ > 0) {
if (pkt->data.frame.width[sl] < prev_frame_width[sl] &&
pkt->data.frame.height[sl] < prev_frame_height[sl])
@@ -568,7 +571,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
}
}
- virtual void EndPassHook(void) {
+ virtual void EndPassHook() {
if (change_bitrate_) last_pts_ = last_pts_ - last_pts_ref_;
duration_ = (last_pts_ + 1) * timebase_;
for (int sl = 0; sl < number_spatial_layers_; ++sl) {
@@ -678,6 +681,152 @@ class DatarateOnePassCbrSvcSingleBR
}
};
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers, for 4:4:4 Profile 1.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL444Profile1) {
+ SetSvcConfig(3, 3);
+ ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+ cfg_.g_profile = 1;
+ cfg_.g_bit_depth = VPX_BITS_8;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.g_threads = 1;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.kf_max_dist = 9999;
+
+ top_sl_width_ = 352;
+ top_sl_height_ = 288;
+ cfg_.rc_target_bitrate = 500;
+ ResetModel();
+ AssignLayerBitrates();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+ 1.15);
+#if CONFIG_VP9_DECODER
+ // The non-reference frames are expected to be mismatched frames as the
+ // encoder will avoid loopfilter on these frames.
+ EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
+// temporal layers, for 4:2:2 Profile 1.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL3TL422Profile1) {
+ SetSvcConfig(2, 3);
+ ::libvpx_test::Y4mVideoSource video("park_joy_90p_8_422.y4m", 0, 20);
+ cfg_.g_profile = 1;
+ cfg_.g_bit_depth = VPX_BITS_8;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.g_threads = 1;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.kf_max_dist = 9999;
+
+ top_sl_width_ = 160;
+ top_sl_height_ = 90;
+ cfg_.rc_target_bitrate = 500;
+ ResetModel();
+ AssignLayerBitrates();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // Use large under/over shoot thresholds as this is a very short clip,
+ // so not good for testing rate-targeting.
+ CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5,
+ 1.7);
+#if CONFIG_VP9_DECODER
+ // The non-reference frames are expected to be mismatched frames as the
+ // encoder will avoid loopfilter on these frames.
+ EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers, for Profle 2 10bit.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL10bitProfile2) {
+ SetSvcConfig(3, 3);
+ ::libvpx_test::Y4mVideoSource video("park_joy_90p_10_420_20f.y4m", 0, 20);
+ cfg_.g_profile = 2;
+ cfg_.g_bit_depth = VPX_BITS_10;
+ cfg_.g_input_bit_depth = VPX_BITS_10;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.g_threads = 1;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.kf_max_dist = 9999;
+
+ top_sl_width_ = 160;
+ top_sl_height_ = 90;
+ cfg_.rc_target_bitrate = 500;
+ ResetModel();
+ AssignLayerBitrates();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // TODO(marpan/jianj): Comment out the rate-target checking for now
+ // as superframe parsing to get frame size needs to be fixed for
+ // high bitdepth.
+ /*
+ // Use large under/over shoot thresholds as this is a very short clip,
+ // so not good for testing rate-targeting.
+ CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5,
+ 1.7);
+ */
+#if CONFIG_VP9_DECODER
+ // The non-reference frames are expected to be mismatched frames as the
+ // encoder will avoid loopfilter on these frames.
+ EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers, for Profle 2 12bit.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL12bitProfile2) {
+ SetSvcConfig(3, 3);
+ ::libvpx_test::Y4mVideoSource video("park_joy_90p_12_420_20f.y4m", 0, 20);
+ cfg_.g_profile = 2;
+ cfg_.g_bit_depth = VPX_BITS_12;
+ cfg_.g_input_bit_depth = VPX_BITS_12;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.g_threads = 1;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.kf_max_dist = 9999;
+
+ top_sl_width_ = 160;
+ top_sl_height_ = 90;
+ cfg_.rc_target_bitrate = 500;
+ ResetModel();
+ AssignLayerBitrates();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // TODO(marpan/jianj): Comment out the rate-target checking for now
+ // as superframe parsing to get frame size needs to be fixed for
+ // high bitdepth.
+ /*
+ // Use large under/over shoot thresholds as this is a very short clip,
+ // so not good for testing rate-targeting.
+ CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5,
+ 1.7);
+ */
+#if CONFIG_VP9_DECODER
+ // The non-reference frames are expected to be mismatched frames as the
+ // encoder will avoid loopfilter on these frames.
+ EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+#endif
+
// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1
// temporal layer, with screen content mode on and same speed setting for all
// layers.
@@ -1054,6 +1203,37 @@ TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassCbrSvc2SL3TL) {
#endif
}
+// Check basic rate targeting for 1 pass VBR SVC: 2 spatial layers and
+// 3 temporal layers. Run VGA clip with 1 thread.
+TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassVbrSvc2SL3TL) {
+ SetSvcConfig(2, 3);
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.g_threads = 1;
+ cfg_.rc_dropframe_thresh = 30;
+ cfg_.kf_max_dist = 9999;
+ cfg_.rc_end_usage = VPX_VBR;
+ ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+ 0, 400);
+ top_sl_width_ = 640;
+ top_sl_height_ = 480;
+ const int bitrates[3] = { 200, 400, 600 };
+ cfg_.rc_target_bitrate = bitrates[GET_PARAM(2)];
+ ResetModel();
+ AssignLayerBitrates();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70,
+ 1.3);
+#if CONFIG_VP9_DECODER
+ // The non-reference frames are expected to be mismatched frames as the
+ // encoder will avoid loopfilter on these frames.
+ EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
// Params: speed setting, layer framedrop control and index for bitrate array.
class DatarateOnePassCbrSvcFrameDropMultiBR
: public DatarateOnePassCbrSvc,
diff --git a/test/test.mk b/test/test.mk
index 6df457290..f60d8f823 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -59,6 +59,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ext_ratectrl_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += ../vp9/simple_encode.h
LIBVPX_TEST_SRCS-yes += decode_test_driver.cc
LIBVPX_TEST_SRCS-yes += decode_test_driver.h
diff --git a/test/tools_common.sh b/test/tools_common.sh
index 844a12534..0e4a0a5c0 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -133,7 +133,7 @@ vpx_config_option_enabled() {
vpx_config_option="${1}"
vpx_config_file="${LIBVPX_CONFIG_PATH}/vpx_config.h"
config_line=$(grep "${vpx_config_option}" "${vpx_config_file}")
- if echo "${config_line}" | egrep -q '1$'; then
+ if echo "${config_line}" | grep -E -q '1$'; then
echo yes
fi
}
@@ -222,7 +222,7 @@ filter_strings() {
if [ -n "${filter}" ]; then
for s in ${strings}; do
- if echo "${s}" | egrep -q ${exclude} "${filter}" > /dev/null 2>&1; then
+ if echo "${s}" | grep -E -q ${exclude} "${filter}" > /dev/null 2>&1; then
filtered_strings="${filtered_strings} ${s}"
fi
done
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 80855052d..a6c8ef048 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -488,8 +488,8 @@ void MainTestClass<VarianceFunctionType>::SpeedTest() {
}
vpx_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
- printf("Variance %dx%d time: %5d ms\n", width(), height(),
- elapsed_time / 1000);
+ printf("Variance %dx%d %dbpp time: %5d ms\n", width(), height(),
+ params_.bit_depth, elapsed_time / 1000);
}
////////////////////////////////////////////////////////////////////////////////
@@ -499,14 +499,21 @@ template <typename FunctionType>
void MainTestClass<FunctionType>::RefTestMse() {
for (int i = 0; i < 10; ++i) {
for (int j = 0; j < block_size(); ++j) {
- src_[j] = rnd_.Rand8();
- ref_[j] = rnd_.Rand8();
+ if (!use_high_bit_depth()) {
+ src_[j] = rnd_.Rand8();
+ ref_[j] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
}
unsigned int sse1, sse2;
const int stride = width();
ASM_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1));
variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
- stride, &sse2, false, VPX_BITS_8);
+ stride, &sse2, use_high_bit_depth(), params_.bit_depth);
EXPECT_EQ(sse1, sse2);
}
}
@@ -530,8 +537,15 @@ void MainTestClass<FunctionType>::RefTestSse() {
template <typename FunctionType>
void MainTestClass<FunctionType>::MaxTestMse() {
- memset(src_, 255, block_size());
- memset(ref_, 0, block_size());
+ if (!use_high_bit_depth()) {
+ memset(src_, 255, block_size());
+ memset(ref_, 0, block_size());
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size());
+ vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, block_size());
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
unsigned int sse;
ASM_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse));
const unsigned int expected = block_size() * 255 * 255;
@@ -854,25 +868,25 @@ TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
-/* TODO(debargha): This test does not support the highbd version
typedef MainTestClass<vpx_variance_fn_t> VpxHBDMseTest;
TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); }
TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(VpxHBDMseTest, DISABLED_Speed) { SpeedTest(); }
INSTANTIATE_TEST_SUITE_P(
C, VpxHBDMseTest,
- ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c),
- MseParams(4, 4, &vpx_highbd_12_mse16x8_c),
- MseParams(4, 4, &vpx_highbd_12_mse8x16_c),
- MseParams(4, 4, &vpx_highbd_12_mse8x8_c),
- MseParams(4, 4, &vpx_highbd_10_mse16x16_c),
- MseParams(4, 4, &vpx_highbd_10_mse16x8_c),
- MseParams(4, 4, &vpx_highbd_10_mse8x16_c),
- MseParams(4, 4, &vpx_highbd_10_mse8x8_c),
- MseParams(4, 4, &vpx_highbd_8_mse16x16_c),
- MseParams(4, 4, &vpx_highbd_8_mse16x8_c),
- MseParams(4, 4, &vpx_highbd_8_mse8x16_c),
- MseParams(4, 4, &vpx_highbd_8_mse8x8_c)));
-*/
+ ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c, VPX_BITS_12),
+ MseParams(4, 3, &vpx_highbd_12_mse16x8_c, VPX_BITS_12),
+ MseParams(3, 4, &vpx_highbd_12_mse8x16_c, VPX_BITS_12),
+ MseParams(3, 3, &vpx_highbd_12_mse8x8_c, VPX_BITS_12),
+ MseParams(4, 4, &vpx_highbd_10_mse16x16_c, VPX_BITS_10),
+ MseParams(4, 3, &vpx_highbd_10_mse16x8_c, VPX_BITS_10),
+ MseParams(3, 4, &vpx_highbd_10_mse8x16_c, VPX_BITS_10),
+ MseParams(3, 3, &vpx_highbd_10_mse8x8_c, VPX_BITS_10),
+ MseParams(4, 4, &vpx_highbd_8_mse16x16_c, VPX_BITS_8),
+ MseParams(4, 3, &vpx_highbd_8_mse16x8_c, VPX_BITS_8),
+ MseParams(3, 4, &vpx_highbd_8_mse8x16_c, VPX_BITS_8),
+ MseParams(3, 3, &vpx_highbd_8_mse8x8_c, VPX_BITS_8)));
+
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VpxHBDMseTest);
INSTANTIATE_TEST_SUITE_P(
@@ -1138,22 +1152,15 @@ INSTANTIATE_TEST_SUITE_P(
SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
#if CONFIG_VP9_HIGHBITDEPTH
-/* TODO(debargha): This test does not support the highbd version
INSTANTIATE_TEST_SUITE_P(
SSE2, VpxHBDMseTest,
- ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2),
- MseParams(4, 3, &vpx_highbd_12_mse16x8_sse2),
- MseParams(3, 4, &vpx_highbd_12_mse8x16_sse2),
- MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2),
- MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2),
- MseParams(4, 3, &vpx_highbd_10_mse16x8_sse2),
- MseParams(3, 4, &vpx_highbd_10_mse8x16_sse2),
- MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2),
- MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2),
- MseParams(4, 3, &vpx_highbd_8_mse16x8_sse2),
- MseParams(3, 4, &vpx_highbd_8_mse8x16_sse2),
- MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2)));
-*/
+ ::testing::Values(
+ MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2, VPX_BITS_12),
+ MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2, VPX_BITS_12),
+ MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2, VPX_BITS_10),
+ MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2, VPX_BITS_10),
+ MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2, VPX_BITS_8),
+ MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2, VPX_BITS_8)));
INSTANTIATE_TEST_SUITE_P(
SSE2, VpxHBDVarianceTest,
@@ -1495,6 +1502,224 @@ INSTANTIATE_TEST_SUITE_P(
SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0),
SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0),
SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0)));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+ NEON, VpxHBDVarianceTest,
+ ::testing::Values(
+ VarianceParams(6, 6, &vpx_highbd_12_variance64x64_neon, 12),
+ VarianceParams(6, 5, &vpx_highbd_12_variance64x32_neon, 12),
+ VarianceParams(5, 6, &vpx_highbd_12_variance32x64_neon, 12),
+ VarianceParams(5, 5, &vpx_highbd_12_variance32x32_neon, 12),
+ VarianceParams(5, 4, &vpx_highbd_12_variance32x16_neon, 12),
+ VarianceParams(4, 5, &vpx_highbd_12_variance16x32_neon, 12),
+ VarianceParams(4, 4, &vpx_highbd_12_variance16x16_neon, 12),
+ VarianceParams(4, 3, &vpx_highbd_12_variance16x8_neon, 12),
+ VarianceParams(3, 4, &vpx_highbd_12_variance8x16_neon, 12),
+ VarianceParams(3, 3, &vpx_highbd_12_variance8x8_neon, 12),
+ VarianceParams(3, 2, &vpx_highbd_12_variance8x4_neon, 12),
+ VarianceParams(2, 3, &vpx_highbd_12_variance4x8_neon, 12),
+ VarianceParams(2, 2, &vpx_highbd_12_variance4x4_neon, 12),
+ VarianceParams(6, 6, &vpx_highbd_10_variance64x64_neon, 10),
+ VarianceParams(6, 5, &vpx_highbd_10_variance64x32_neon, 10),
+ VarianceParams(5, 6, &vpx_highbd_10_variance32x64_neon, 10),
+ VarianceParams(5, 5, &vpx_highbd_10_variance32x32_neon, 10),
+ VarianceParams(5, 4, &vpx_highbd_10_variance32x16_neon, 10),
+ VarianceParams(4, 5, &vpx_highbd_10_variance16x32_neon, 10),
+ VarianceParams(4, 4, &vpx_highbd_10_variance16x16_neon, 10),
+ VarianceParams(4, 3, &vpx_highbd_10_variance16x8_neon, 10),
+ VarianceParams(3, 4, &vpx_highbd_10_variance8x16_neon, 10),
+ VarianceParams(3, 3, &vpx_highbd_10_variance8x8_neon, 10),
+ VarianceParams(3, 2, &vpx_highbd_10_variance8x4_neon, 10),
+ VarianceParams(2, 3, &vpx_highbd_10_variance4x8_neon, 10),
+ VarianceParams(2, 2, &vpx_highbd_10_variance4x4_neon, 10),
+ VarianceParams(6, 6, &vpx_highbd_8_variance64x64_neon, 8),
+ VarianceParams(6, 5, &vpx_highbd_8_variance64x32_neon, 8),
+ VarianceParams(5, 6, &vpx_highbd_8_variance32x64_neon, 8),
+ VarianceParams(5, 5, &vpx_highbd_8_variance32x32_neon, 8),
+ VarianceParams(5, 4, &vpx_highbd_8_variance32x16_neon, 8),
+ VarianceParams(4, 5, &vpx_highbd_8_variance16x32_neon, 8),
+ VarianceParams(4, 4, &vpx_highbd_8_variance16x16_neon, 8),
+ VarianceParams(4, 3, &vpx_highbd_8_variance16x8_neon, 8),
+ VarianceParams(3, 4, &vpx_highbd_8_variance8x16_neon, 8),
+ VarianceParams(3, 3, &vpx_highbd_8_variance8x8_neon, 8),
+ VarianceParams(3, 2, &vpx_highbd_8_variance8x4_neon, 8),
+ VarianceParams(2, 3, &vpx_highbd_8_variance4x8_neon, 8),
+ VarianceParams(2, 2, &vpx_highbd_8_variance4x4_neon, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, VpxHBDSubpelVarianceTest,
+ ::testing::Values(
+ SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_neon,
+ 12),
+ SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_neon,
+ 12),
+ SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_neon,
+ 12),
+ SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_neon,
+ 12),
+ SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_neon,
+ 12),
+ SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_neon,
+ 12),
+ SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_neon,
+ 12),
+ SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_neon,
+ 12),
+ SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_neon,
+ 12),
+ SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_neon,
+ 12),
+ SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon,
+ 12),
+ SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon,
+ 10),
+ SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon,
+ 10),
+ SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_neon,
+ 10),
+ SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_neon,
+ 10),
+ SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_neon,
+ 10),
+ SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_neon,
+ 10),
+ SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_neon,
+ 10),
+ SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_neon,
+ 10),
+ SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_neon,
+ 10),
+ SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_neon,
+ 10),
+ SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon,
+ 10),
+ SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon,
+ 8),
+ SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon,
+ 8),
+ SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_neon,
+ 8),
+ SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_neon,
+ 8),
+ SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_neon,
+ 8),
+ SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_neon,
+ 8),
+ SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_neon,
+ 8),
+ SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_neon,
+ 8),
+ SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon,
+ 8),
+ SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8),
+ SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon,
+ 8)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, VpxHBDSubpelAvgVarianceTest,
+ ::testing::Values(
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_12_sub_pixel_avg_variance64x64_neon,
+ 12),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance64x32_neon,
+ 12),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_12_sub_pixel_avg_variance32x64_neon,
+ 12),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance32x32_neon,
+ 12),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance32x16_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance16x32_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance16x16_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_12_sub_pixel_avg_variance16x8_neon,
+ 12),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance8x16_neon,
+ 12),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_12_sub_pixel_avg_variance8x8_neon,
+ 12),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_12_sub_pixel_avg_variance8x4_neon,
+ 12),
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_10_sub_pixel_avg_variance64x64_neon,
+ 10),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance64x32_neon,
+ 10),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_10_sub_pixel_avg_variance32x64_neon,
+ 10),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance32x32_neon,
+ 10),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance32x16_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance16x32_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance16x16_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_10_sub_pixel_avg_variance16x8_neon,
+ 10),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance8x16_neon,
+ 10),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_10_sub_pixel_avg_variance8x8_neon,
+ 10),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_10_sub_pixel_avg_variance8x4_neon,
+ 10),
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_8_sub_pixel_avg_variance64x64_neon,
+ 8),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance64x32_neon,
+ 8),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_8_sub_pixel_avg_variance32x64_neon,
+ 8),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance32x32_neon,
+ 8),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance32x16_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance16x32_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance16x16_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_8_sub_pixel_avg_variance16x8_neon,
+ 8),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance8x16_neon,
+ 8),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_8_sub_pixel_avg_variance8x8_neon,
+ 8),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_8_sub_pixel_avg_variance8x4_neon,
+ 8)));
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_NEON
#if HAVE_MSA
diff --git a/test/vp8_datarate_test.cc b/test/vp8_datarate_test.cc
index dcd68a2d4..64a861d15 100644
--- a/test/vp8_datarate_test.cc
+++ b/test/vp8_datarate_test.cc
@@ -121,7 +121,7 @@ class DatarateTestLarge
++frame_number_;
}
- virtual void EndPassHook(void) {
+ virtual void EndPassHook() {
if (bits_total_) {
const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit
diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc
index ad310666e..7410f3c01 100644
--- a/test/vp8_ratectrl_rtc_test.cc
+++ b/test/vp8_ratectrl_rtc_test.cc
@@ -127,8 +127,7 @@ class Vp8RcInterfaceTest
encoder->Control(VP8E_SET_CPUUSED, -6);
encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1);
encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
- }
- if (frame_params_.frame_type == INTER_FRAME) {
+ } else if (frame_params_.frame_type == INTER_FRAME) {
// Disable golden frame update.
frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc
index 9930c754c..7e9180749 100644
--- a/test/vp9_datarate_test.cc
+++ b/test/vp9_datarate_test.cc
@@ -9,6 +9,7 @@
*/
#include "./vpx_config.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
#include "test/codec_factory.h"
#include "test/encode_test_driver.h"
#include "test/i420_video_source.h"
@@ -147,14 +148,16 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest {
if (video->frame() == 0) {
encoder->Control(VP9E_SET_SVC, 1);
}
- vpx_svc_layer_id_t layer_id;
- layer_id.spatial_layer_id = 0;
- frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers);
- layer_id.temporal_layer_id =
- SetLayerId(video->frame(), cfg_.ts_number_layers);
- layer_id.temporal_layer_id_per_spatial[0] =
- SetLayerId(video->frame(), cfg_.ts_number_layers);
- encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+ if (cfg_.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+ vpx_svc_layer_id_t layer_id;
+ frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers);
+ layer_id.spatial_layer_id = 0;
+ layer_id.temporal_layer_id =
+ SetLayerId(video->frame(), cfg_.ts_number_layers);
+ layer_id.temporal_layer_id_per_spatial[0] =
+ SetLayerId(video->frame(), cfg_.ts_number_layers);
+ encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+ }
}
const vpx_rational_t tb = video->timebase();
timebase_ = static_cast<double>(tb.num) / tb.den;
@@ -199,7 +202,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest {
++tot_frame_number_;
}
- virtual void EndPassHook(void) {
+ virtual void EndPassHook() {
for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
++layer) {
duration_ = (last_pts_ + 1) * timebase_;
@@ -809,6 +812,135 @@ TEST_P(DatarateTestVP9PostEncodeDrop, PostEncodeDropScreenContent) {
<< " The datarate for the file is greater than target by too much!";
}
+using libvpx_test::ACMRandom;
+
+class DatarateTestVP9FrameQp
+ : public DatarateTestVP9,
+ public ::testing::TestWithParam<const libvpx_test::CodecFactory *> {
+ public:
+ DatarateTestVP9FrameQp() : DatarateTestVP9(GetParam()), frame_(0) {}
+ virtual ~DatarateTestVP9FrameQp() {}
+
+ protected:
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(::libvpx_test::kRealTime);
+ ResetModel();
+ }
+
+ virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) {
+ set_cpu_used_ = 7;
+ DatarateTestVP9::PreEncodeFrameHook(video, encoder);
+ frame_qp_ = static_cast<int>(rnd_.RandRange(64));
+ encoder->Control(VP9E_SET_QUANTIZER_ONE_PASS, frame_qp_);
+ frame_++;
+ }
+
+ virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
+ int qp = 0;
+ vpx_svc_layer_id_t layer_id;
+ if (frame_ >= total_frame_) return;
+ encoder->Control(VP8E_GET_LAST_QUANTIZER_64, &qp);
+ ASSERT_EQ(frame_qp_, qp);
+ encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id);
+ temporal_layer_id_ = layer_id.temporal_layer_id;
+ }
+
+ virtual void MismatchHook(const vpx_image_t * /*img1*/,
+ const vpx_image_t * /*img2*/) {
+ if (frame_ >= total_frame_) return;
+ ASSERT_TRUE(cfg_.temporal_layering_mode ==
+ VP9E_TEMPORAL_LAYERING_MODE_0212 &&
+ temporal_layer_id_ == 2);
+ }
+
+ protected:
+ int total_frame_;
+
+ private:
+ ACMRandom rnd_;
+ int frame_qp_;
+ int frame_;
+ int temporal_layer_id_;
+};
+
+TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ total_frame_ = 400;
+ ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+ 0, total_frame_);
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersBypass) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
+ cfg_.ss_number_layers = 1;
+ cfg_.ts_number_layers = 3;
+ cfg_.ts_rate_decimator[0] = 4;
+ cfg_.ts_rate_decimator[1] = 2;
+ cfg_.ts_rate_decimator[2] = 1;
+
+ cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+ cfg_.rc_target_bitrate = 200;
+ total_frame_ = 400;
+ ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+ 0, total_frame_);
+ ResetModel();
+ cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+ cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+ cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
+ cfg_.ss_number_layers = 1;
+ cfg_.ts_number_layers = 3;
+ cfg_.ts_rate_decimator[0] = 4;
+ cfg_.ts_rate_decimator[1] = 2;
+ cfg_.ts_rate_decimator[2] = 1;
+
+ cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212;
+ cfg_.rc_target_bitrate = 200;
+ cfg_.g_error_resilient = 1;
+ total_frame_ = 400;
+ ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+ 0, total_frame_);
+ ResetModel();
+ cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+ cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+ cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
#if CONFIG_VP9_TEMPORAL_DENOISING
// Params: speed setting.
class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime {
@@ -943,6 +1075,13 @@ VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9LargeVBR, ::testing::Range(5, 9),
VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTime, ::testing::Range(5, 10));
+#if CONFIG_VP9
+INSTANTIATE_TEST_SUITE_P(
+ VP9, DatarateTestVP9FrameQp,
+ ::testing::Values(
+ static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)));
+#endif
+
VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeDeltaQUV,
::testing::Range(5, 10),
::testing::Values(-5, -10, -15));
diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc
index 60a350b84..2bfa6281d 100644
--- a/test/vp9_ext_ratectrl_test.cc
+++ b/test/vp9_ext_ratectrl_test.cc
@@ -16,28 +16,50 @@
#include "test/util.h"
#include "test/yuv_video_source.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vp9/simple_encode.h"
#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx_dsp/vpx_dsp_common.h"
namespace {
constexpr int kModelMagicNumber = 51396;
constexpr uintptr_t PrivMagicNumber = 5566;
constexpr int kFrameNum = 5;
+constexpr int kFrameNumGOP = 30;
+constexpr int kFrameNumGOPShort = 4;
constexpr int kLosslessCodingIndex = 2;
+constexpr int kFixedGOPSize = 9;
+// The range check in vp9_cx_iface.c shows that the max
+// lag in buffer is MAX_LAG_BUFFERS (25):
+// RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
+constexpr int kMaxLagInFrames = 25;
+constexpr int kDefaultMinGfInterval = 4;
+constexpr int kDefaultMaxGfInterval = 16;
+// The active gf interval might change for each GOP
+// See function "get_active_gf_inverval_range".
+// The numbers below are from manual inspection.
+constexpr int kReadMinGfInterval = 5;
+constexpr int kReadMaxGfInterval = 13;
+const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv";
+const double kPsnrThreshold = 30.50;
struct ToyRateCtrl {
int magic_number;
int coding_index;
+
+ int gop_global_index;
+ int frames_since_key;
+ int show_index;
};
vpx_rc_status_t rc_create_model(void *priv,
const vpx_rc_config_t *ratectrl_config,
- vpx_rc_model_t *rate_ctrl_model_pt) {
+ vpx_rc_model_t *rate_ctrl_model_ptr) {
ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
- EXPECT_NE(toy_rate_ctrl, nullptr);
+ if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
toy_rate_ctrl->magic_number = kModelMagicNumber;
toy_rate_ctrl->coding_index = -1;
- *rate_ctrl_model_pt = toy_rate_ctrl;
+ *rate_ctrl_model_ptr = toy_rate_ctrl;
EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
EXPECT_EQ(ratectrl_config->frame_width, 352);
EXPECT_EQ(ratectrl_config->frame_height, 288);
@@ -48,6 +70,48 @@ vpx_rc_status_t rc_create_model(void *priv,
return VPX_RC_OK;
}
+vpx_rc_status_t rc_create_model_gop(void *priv,
+ const vpx_rc_config_t *ratectrl_config,
+ vpx_rc_model_t *rate_ctrl_model_ptr) {
+ ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
+ if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
+ toy_rate_ctrl->magic_number = kModelMagicNumber;
+ toy_rate_ctrl->gop_global_index = 0;
+ toy_rate_ctrl->frames_since_key = 0;
+ toy_rate_ctrl->show_index = 0;
+ toy_rate_ctrl->coding_index = 0;
+ *rate_ctrl_model_ptr = toy_rate_ctrl;
+ EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
+ EXPECT_EQ(ratectrl_config->frame_width, 640);
+ EXPECT_EQ(ratectrl_config->frame_height, 360);
+ EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOP);
+ EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 4000);
+ EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
+ EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_create_model_gop_short(
+ void *priv, const vpx_rc_config_t *ratectrl_config,
+ vpx_rc_model_t *rate_ctrl_model_ptr) {
+ ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
+ if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
+ toy_rate_ctrl->magic_number = kModelMagicNumber;
+ toy_rate_ctrl->gop_global_index = 0;
+ toy_rate_ctrl->frames_since_key = 0;
+ toy_rate_ctrl->show_index = 0;
+ toy_rate_ctrl->coding_index = 0;
+ *rate_ctrl_model_ptr = toy_rate_ctrl;
+ EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
+ EXPECT_EQ(ratectrl_config->frame_width, 352);
+ EXPECT_EQ(ratectrl_config->frame_height, 288);
+ EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOPShort);
+ EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 500);
+ EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
+ EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
+ return VPX_RC_OK;
+}
+
vpx_rc_status_t rc_send_firstpass_stats(
vpx_rc_model_t rate_ctrl_model,
const vpx_rc_firstpass_stats_t *first_pass_stats) {
@@ -61,6 +125,32 @@ vpx_rc_status_t rc_send_firstpass_stats(
return VPX_RC_OK;
}
+vpx_rc_status_t rc_send_firstpass_stats_gop(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_firstpass_stats_t *first_pass_stats) {
+ const ToyRateCtrl *toy_rate_ctrl =
+ static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOP);
+ for (int i = 0; i < first_pass_stats->num_frames; ++i) {
+ EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
+ }
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_send_firstpass_stats_gop_short(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_firstpass_stats_t *first_pass_stats) {
+ const ToyRateCtrl *toy_rate_ctrl =
+ static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOPShort);
+ for (int i = 0; i < first_pass_stats->num_frames; ++i) {
+ EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
+ }
+ return VPX_RC_OK;
+}
+
vpx_rc_status_t rc_get_encodeframe_decision(
vpx_rc_model_t rate_ctrl_model,
const vpx_rc_encodeframe_info_t *encode_frame_info,
@@ -76,19 +166,17 @@ vpx_rc_status_t rc_get_encodeframe_decision(
if (encode_frame_info->coding_index == 0) {
EXPECT_EQ(encode_frame_info->show_index, 0);
EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
0); // kRefFrameTypeLast
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
0); // kRefFrameTypePast
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
0); // kRefFrameTypeFuture
- }
-
- if (encode_frame_info->coding_index == 1) {
+ } else if (encode_frame_info->coding_index == 1) {
EXPECT_EQ(encode_frame_info->show_index, 4);
EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
1); // kRefFrameTypeLast
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
@@ -97,19 +185,15 @@ vpx_rc_status_t rc_get_encodeframe_decision(
0); // kRefFrameTypeFuture
EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
0); // kRefFrameTypeLast
- }
-
- if (encode_frame_info->coding_index >= 2 &&
- encode_frame_info->coding_index < 5) {
+ } else if (encode_frame_info->coding_index >= 2 &&
+ encode_frame_info->coding_index < 5) {
// In the first group of pictures, coding_index and gop_index are equal.
EXPECT_EQ(encode_frame_info->gop_index, encode_frame_info->coding_index);
- EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/);
- }
-
- if (encode_frame_info->coding_index == 5) {
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ } else if (encode_frame_info->coding_index == 5) {
EXPECT_EQ(encode_frame_info->show_index, 4);
EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
1); // kRefFrameTypeLast
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
@@ -133,6 +217,388 @@ vpx_rc_status_t rc_get_encodeframe_decision(
return VPX_RC_OK;
}
+vpx_rc_status_t rc_get_encodeframe_decision_gop(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_info_t *encode_frame_info,
+ vpx_rc_encodeframe_decision_t *frame_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_LT(encode_frame_info->show_index, kFrameNumGOP);
+ EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+ if (encode_frame_info->coding_index == 0) {
+ EXPECT_EQ(encode_frame_info->show_index, 0);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ } else if (encode_frame_info->coding_index == 1) {
+ EXPECT_EQ(encode_frame_info->show_index, 1);
+ EXPECT_EQ(encode_frame_info->gop_index, 1);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 1); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
+ 0); // kRefFrameTypeLast
+ } else if (encode_frame_info->coding_index == 2) {
+ EXPECT_EQ(encode_frame_info->show_index, 2);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ } else if (encode_frame_info->coding_index == 3 ||
+ encode_frame_info->coding_index == 12 ||
+ encode_frame_info->coding_index == 21) {
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
+ EXPECT_EQ(encode_frame_info->gop_index, 1);
+ } else if (encode_frame_info->coding_index == 11 ||
+ encode_frame_info->coding_index == 20 ||
+ encode_frame_info->coding_index == 29) {
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ } else if (encode_frame_info->coding_index >= 30) {
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ }
+
+ // When the model recommends an invalid q, valid range [0, 255],
+ // the encoder will ignore it and use the default q selected
+ // by libvpx rate control strategy.
+ frame_decision->q_index = VPX_DEFAULT_Q;
+ frame_decision->max_frame_size = 0;
+
+ toy_rate_ctrl->coding_index += 1;
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_encodeframe_decision_gop_short(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_info_t *encode_frame_info,
+ vpx_rc_encodeframe_decision_t *frame_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
+ EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+ if (encode_frame_info->coding_index == 0) {
+ EXPECT_EQ(encode_frame_info->show_index, 0);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 1) {
+ EXPECT_EQ(encode_frame_info->show_index, 1);
+ EXPECT_EQ(encode_frame_info->gop_index, 1);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 1); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 2) {
+ EXPECT_EQ(encode_frame_info->show_index, 2);
+ EXPECT_EQ(encode_frame_info->gop_index, 2);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 3) {
+ EXPECT_EQ(encode_frame_info->show_index, 3);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden);
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2);
+ }
+
+ // When the model recommends an invalid q, valid range [0, 255],
+ // the encoder will ignore it and use the default q selected
+ // by libvpx rate control strategy.
+ frame_decision->q_index = VPX_DEFAULT_Q;
+ frame_decision->max_frame_size = 0;
+
+ toy_rate_ctrl->coding_index += 1;
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_info_t *encode_frame_info,
+ vpx_rc_encodeframe_decision_t *frame_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
+ EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+ if (encode_frame_info->coding_index == 0) {
+ EXPECT_EQ(encode_frame_info->show_index, 0);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 1) {
+ EXPECT_EQ(encode_frame_info->show_index, 3);
+ EXPECT_EQ(encode_frame_info->gop_index, 1);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 1); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 2) {
+ EXPECT_EQ(encode_frame_info->show_index, 1);
+ EXPECT_EQ(encode_frame_info->gop_index, 2);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 3) {
+ EXPECT_EQ(encode_frame_info->show_index, 2);
+ EXPECT_EQ(encode_frame_info->gop_index, 3);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 4) {
+ EXPECT_EQ(encode_frame_info->show_index, 3);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2);
+ }
+
+ // When the model recommends an invalid q, valid range [0, 255],
+ // the encoder will ignore it and use the default q selected
+ // by libvpx rate control strategy.
+ frame_decision->q_index = VPX_DEFAULT_Q;
+ frame_decision->max_frame_size = 0;
+
+ toy_rate_ctrl->coding_index += 1;
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_encodeframe_decision_gop_short_no_arf(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_info_t *encode_frame_info,
+ vpx_rc_encodeframe_decision_t *frame_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
+ EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+ if (encode_frame_info->coding_index == 0) {
+ EXPECT_EQ(encode_frame_info->show_index, 0);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 1) {
+ EXPECT_EQ(encode_frame_info->show_index, 1);
+ EXPECT_EQ(encode_frame_info->gop_index, 1);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 1); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 2) {
+ EXPECT_EQ(encode_frame_info->show_index, 2);
+ EXPECT_EQ(encode_frame_info->gop_index, 2);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 3) {
+ EXPECT_EQ(encode_frame_info->show_index, 3);
+ EXPECT_EQ(encode_frame_info->gop_index, 3);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ }
+
+ // When the model recommends an invalid q, valid range [0, 255],
+ // the encoder will ignore it and use the default q selected
+ // by libvpx rate control strategy.
+ frame_decision->q_index = VPX_DEFAULT_Q;
+ frame_decision->max_frame_size = 0;
+
+ toy_rate_ctrl->coding_index += 1;
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_gop_info_t *gop_info,
+ vpx_rc_gop_decision_t *gop_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames);
+ EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
+ EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
+ EXPECT_EQ(gop_info->active_min_gf_interval, kReadMinGfInterval);
+ EXPECT_EQ(gop_info->active_max_gf_interval, kReadMaxGfInterval);
+ EXPECT_EQ(gop_info->allow_alt_ref, 1);
+ if (gop_info->is_key_frame) {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+ EXPECT_EQ(gop_info->frames_since_key, 0);
+ EXPECT_EQ(gop_info->gop_global_index, 0);
+ toy_rate_ctrl->gop_global_index = 0;
+ toy_rate_ctrl->frames_since_key = 0;
+ } else {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1);
+ }
+ EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
+ EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
+ EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
+ EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
+
+ gop_decision->gop_coding_frames =
+ VPXMIN(kFixedGOPSize, gop_info->frames_to_key);
+ gop_decision->use_alt_ref = gop_decision->gop_coding_frames == kFixedGOPSize;
+ toy_rate_ctrl->frames_since_key +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ toy_rate_ctrl->show_index +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ ++toy_rate_ctrl->gop_global_index;
+ return VPX_RC_OK;
+}
+
+// Test on a 4 frame video.
+// Test a setting of 2 GOPs.
+// The first GOP has 3 coding frames, no alt ref.
+// The second GOP has 1 coding frame, no alt ref.
+vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_gop_info_t *gop_info,
+ vpx_rc_gop_decision_t *gop_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
+ EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
+ EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
+ EXPECT_EQ(gop_info->allow_alt_ref, 1);
+ if (gop_info->is_key_frame) {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+ EXPECT_EQ(gop_info->frames_since_key, 0);
+ EXPECT_EQ(gop_info->gop_global_index, 0);
+ toy_rate_ctrl->gop_global_index = 0;
+ toy_rate_ctrl->frames_since_key = 0;
+ } else {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+ }
+ EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
+ EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
+ EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
+ EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
+
+ gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 3 : 1;
+ gop_decision->use_alt_ref = 0;
+ toy_rate_ctrl->frames_since_key +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ toy_rate_ctrl->show_index +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ ++toy_rate_ctrl->gop_global_index;
+ return VPX_RC_OK;
+}
+
+// Test on a 4 frame video.
+// Test a setting of 2 GOPs.
+// The first GOP has 4 coding frames. Use alt ref.
+// The second GOP only contains the overlay frame of the first GOP's alt ref
+// frame.
+vpx_rc_status_t rc_get_gop_decision_short_overlay(
+ vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
+ vpx_rc_gop_decision_t *gop_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
+ EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
+ EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
+ EXPECT_EQ(gop_info->allow_alt_ref, 1);
+ if (gop_info->is_key_frame) {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+ EXPECT_EQ(gop_info->frames_since_key, 0);
+ EXPECT_EQ(gop_info->gop_global_index, 0);
+ toy_rate_ctrl->gop_global_index = 0;
+ toy_rate_ctrl->frames_since_key = 0;
+ } else {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1);
+ }
+ EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
+ EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
+ EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
+ EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
+
+ gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1;
+ gop_decision->use_alt_ref = gop_info->is_key_frame ? 1 : 0;
+ toy_rate_ctrl->frames_since_key +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ toy_rate_ctrl->show_index +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ ++toy_rate_ctrl->gop_global_index;
+ return VPX_RC_OK;
+}
+
+// Test on a 4 frame video.
+// Test a setting of 1 GOP.
+// The GOP has 4 coding frames. Do not use alt ref.
+vpx_rc_status_t rc_get_gop_decision_short_no_arf(
+ vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
+ vpx_rc_gop_decision_t *gop_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
+ EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
+ EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
+ EXPECT_EQ(gop_info->allow_alt_ref, 1);
+ if (gop_info->is_key_frame) {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+ EXPECT_EQ(gop_info->frames_since_key, 0);
+ EXPECT_EQ(gop_info->gop_global_index, 0);
+ toy_rate_ctrl->gop_global_index = 0;
+ toy_rate_ctrl->frames_since_key = 0;
+ } else {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+ }
+ EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
+ EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
+ EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
+ EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
+
+ gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1;
+ gop_decision->use_alt_ref = 0;
+ toy_rate_ctrl->frames_since_key +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ toy_rate_ctrl->show_index +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ ++toy_rate_ctrl->gop_global_index;
+ return VPX_RC_OK;
+}
+
vpx_rc_status_t rc_update_encodeframe_result(
vpx_rc_model_t rate_ctrl_model,
const vpx_rc_encodeframe_result_t *encode_frame_result) {
@@ -153,6 +619,43 @@ vpx_rc_status_t rc_update_encodeframe_result(
return VPX_RC_OK;
}
+vpx_rc_status_t rc_update_encodeframe_result_gop(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_result_t *encode_frame_result) {
+ const ToyRateCtrl *toy_rate_ctrl =
+ static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+
+ const int64_t ref_pixel_count = 640 * 360 * 3 / 2;
+ EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_update_encodeframe_result_gop_short(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_result_t *encode_frame_result) {
+ const ToyRateCtrl *toy_rate_ctrl =
+ static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+
+ const int64_t ref_pixel_count = 352 * 288 * 3 / 2;
+ EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_default_frame_rdmult(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_info_t *encode_frame_info, int *rdmult) {
+ const ToyRateCtrl *toy_rate_ctrl =
+ static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
+ EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+ *rdmult = VPX_DEFAULT_RDMULT;
+ return VPX_RC_OK;
+}
+
vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) {
ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
@@ -176,6 +679,7 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest,
::libvpx_test::Encoder *encoder) override {
if (video->frame() == 0) {
vpx_rc_funcs_t rc_funcs;
+ rc_funcs.rc_type = VPX_RC_QP;
rc_funcs.create_model = rc_create_model;
rc_funcs.send_firstpass_stats = rc_send_firstpass_stats;
rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision;
@@ -195,8 +699,266 @@ TEST_F(ExtRateCtrlTest, EncodeTest) {
"bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0,
kFrameNum));
- ASSERT_NE(video.get(), nullptr);
+ ASSERT_NE(video, nullptr);
ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
}
+class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest,
+ public ::libvpx_test::CodecTestWithParam<int> {
+ protected:
+ ExtRateCtrlTestGOP() : EncoderTest(&::libvpx_test::kVP9) {}
+
+ ~ExtRateCtrlTestGOP() override = default;
+
+ void SetUp() override {
+ InitializeConfig();
+ SetMode(::libvpx_test::kTwoPassGood);
+ }
+
+ void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
+ encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
+
+ vpx_rc_funcs_t rc_funcs;
+ rc_funcs.rc_type = VPX_RC_GOP_QP;
+ rc_funcs.create_model = rc_create_model_gop;
+ rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop;
+ rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop;
+ rc_funcs.get_gop_decision = rc_get_gop_decision;
+ rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop;
+ rc_funcs.delete_model = rc_delete_model;
+ rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+ encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ }
+ }
+};
+
+TEST_F(ExtRateCtrlTestGOP, EncodeTest) {
+ cfg_.rc_target_bitrate = 4000;
+ cfg_.g_lag_in_frames = kMaxLagInFrames;
+ cfg_.rc_end_usage = VPX_VBR;
+
+ std::unique_ptr<libvpx_test::VideoSource> video;
+ video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+ "noisy_clip_640_360.y4m", VPX_IMG_FMT_I420, 640, 360, 30, 1, 0,
+ kFrameNumGOP));
+
+ ASSERT_NE(video, nullptr);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest,
+ public ::libvpx_test::CodecTestWithParam<int> {
+ protected:
+ ExtRateCtrlTestGOPShort() : EncoderTest(&::libvpx_test::kVP9) {}
+
+ ~ExtRateCtrlTestGOPShort() override = default;
+
+ void SetUp() override {
+ InitializeConfig();
+ SetMode(::libvpx_test::kTwoPassGood);
+ }
+
+ void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
+ encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
+ encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
+
+ vpx_rc_funcs_t rc_funcs;
+ rc_funcs.rc_type = VPX_RC_GOP_QP;
+ rc_funcs.create_model = rc_create_model_gop_short;
+ rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
+ rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short;
+ rc_funcs.get_gop_decision = rc_get_gop_decision_short;
+ rc_funcs.update_encodeframe_result =
+ rc_update_encodeframe_result_gop_short;
+ rc_funcs.delete_model = rc_delete_model;
+ rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+ encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ }
+ }
+};
+
+TEST_F(ExtRateCtrlTestGOPShort, EncodeTest) {
+ cfg_.rc_target_bitrate = 500;
+ cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
+ cfg_.rc_end_usage = VPX_VBR;
+
+ std::unique_ptr<libvpx_test::VideoSource> video;
+ video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+ kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+
+ ASSERT_NE(video, nullptr);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+class ExtRateCtrlTestGOPShortOverlay
+ : public ::libvpx_test::EncoderTest,
+ public ::libvpx_test::CodecTestWithParam<int> {
+ protected:
+ ExtRateCtrlTestGOPShortOverlay() : EncoderTest(&::libvpx_test::kVP9) {}
+
+ ~ExtRateCtrlTestGOPShortOverlay() override = default;
+
+ void SetUp() override {
+ InitializeConfig();
+ SetMode(::libvpx_test::kTwoPassGood);
+ }
+
+ void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
+ encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
+ encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
+
+ vpx_rc_funcs_t rc_funcs;
+ rc_funcs.rc_type = VPX_RC_GOP_QP;
+ rc_funcs.create_model = rc_create_model_gop_short;
+ rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
+ rc_funcs.get_encodeframe_decision =
+ rc_get_encodeframe_decision_gop_short_overlay;
+ rc_funcs.get_gop_decision = rc_get_gop_decision_short_overlay;
+ rc_funcs.update_encodeframe_result =
+ rc_update_encodeframe_result_gop_short;
+ rc_funcs.delete_model = rc_delete_model;
+ rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+ encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ }
+ }
+};
+
+TEST_F(ExtRateCtrlTestGOPShortOverlay, EncodeTest) {
+ cfg_.rc_target_bitrate = 500;
+ cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
+ cfg_.rc_end_usage = VPX_VBR;
+
+ std::unique_ptr<libvpx_test::VideoSource> video;
+ video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+ kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+
+ ASSERT_NE(video, nullptr);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+class ExtRateCtrlTestGOPShortNoARF
+ : public ::libvpx_test::EncoderTest,
+ public ::libvpx_test::CodecTestWithParam<int> {
+ protected:
+ ExtRateCtrlTestGOPShortNoARF() : EncoderTest(&::libvpx_test::kVP9) {}
+
+ ~ExtRateCtrlTestGOPShortNoARF() override = default;
+
+ void SetUp() override {
+ InitializeConfig();
+ SetMode(::libvpx_test::kTwoPassGood);
+ }
+
+ void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
+ encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
+ encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
+
+ vpx_rc_funcs_t rc_funcs;
+ rc_funcs.rc_type = VPX_RC_GOP_QP;
+ rc_funcs.create_model = rc_create_model_gop_short;
+ rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
+ rc_funcs.get_encodeframe_decision =
+ rc_get_encodeframe_decision_gop_short_no_arf;
+ rc_funcs.get_gop_decision = rc_get_gop_decision_short_no_arf;
+ rc_funcs.update_encodeframe_result =
+ rc_update_encodeframe_result_gop_short;
+ rc_funcs.delete_model = rc_delete_model;
+ rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+ encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ }
+ }
+};
+
+TEST_F(ExtRateCtrlTestGOPShortNoARF, EncodeTest) {
+ cfg_.rc_target_bitrate = 500;
+ cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
+ cfg_.rc_end_usage = VPX_VBR;
+
+ std::unique_ptr<libvpx_test::VideoSource> video;
+ video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+ kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+
+ ASSERT_NE(video, nullptr);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest,
+ public ::testing::Test {
+ protected:
+ ExtRateCtrlTestRdmult() : EncoderTest(&::libvpx_test::kVP9) {}
+
+ ~ExtRateCtrlTestRdmult() override = default;
+
+ void SetUp() override {
+ InitializeConfig();
+ SetMode(::libvpx_test::kTwoPassGood);
+ }
+
+ void BeginPassHook(unsigned int) override {
+ psnr_ = 0.0;
+ nframes_ = 0;
+ }
+
+ void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
+ psnr_ += pkt->data.psnr.psnr[0];
+ nframes_++;
+ }
+
+ void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ vpx_rc_funcs_t rc_funcs;
+ rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT;
+ rc_funcs.create_model = rc_create_model_gop_short;
+ rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
+ rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short;
+ rc_funcs.get_gop_decision = rc_get_gop_decision_short;
+ rc_funcs.update_encodeframe_result =
+ rc_update_encodeframe_result_gop_short;
+ rc_funcs.get_frame_rdmult = rc_get_default_frame_rdmult;
+ rc_funcs.delete_model = rc_delete_model;
+ rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+ encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ }
+ }
+
+ double GetAveragePsnr() const {
+ if (nframes_) return psnr_ / nframes_;
+ return 0.0;
+ }
+
+ private:
+ double psnr_;
+ unsigned int nframes_;
+};
+
+TEST_F(ExtRateCtrlTestRdmult, DefaultRdmult) {
+ cfg_.rc_target_bitrate = 500;
+ cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
+ cfg_.rc_end_usage = VPX_VBR;
+ init_flags_ = VPX_CODEC_USE_PSNR;
+
+ std::unique_ptr<libvpx_test::VideoSource> video;
+ video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+ kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+
+ ASSERT_NE(video, nullptr);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+
+ const double psnr = GetAveragePsnr();
+ EXPECT_GT(psnr, kPsnrThreshold);
+}
+
} // namespace
diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index ca1062a76..587cec692 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -67,6 +67,45 @@ void QuantFPWrapper(const tran_low_t *coeff, intptr_t count,
fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan);
}
+void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
+ int16_t *quant, int16_t *quant_shift,
+ int16_t *dequant, int16_t *round_fp,
+ int16_t *quant_fp) {
+ // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V.
+ constexpr int kMaxQRoundingFactorFp = 64;
+
+ for (int j = 0; j < 2; j++) {
+ // The range is 4 to 1828 in the VP9 tables.
+ const int qlookup = rnd->RandRange(1825) + 4;
+ round_fp[j] = (kMaxQRoundingFactorFp * qlookup) >> 7;
+ quant_fp[j] = (1 << 16) / qlookup;
+
+ // Values determined by deconstructing vp9_init_quantizer().
+ // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y
+ // values or U/V values of any bit depth. This is because y_delta is not
+ // factored into the vp9_ac_quant() call.
+ zbin[j] = rnd->RandRange(1200);
+
+ // round may be up to 685 for Y values or 914 for U/V.
+ round[j] = rnd->RandRange(914);
+ // quant ranges from 1 to -32703
+ quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703;
+ // quant_shift goes up to 1 << 16.
+ quant_shift[j] = rnd->RandRange(16384);
+ // dequant maxes out at 1828 for all cases.
+ dequant[j] = rnd->RandRange(1828);
+ }
+ for (int j = 2; j < 8; j++) {
+ zbin[j] = zbin[1];
+ round_fp[j] = round_fp[1];
+ quant_fp[j] = quant_fp[1];
+ round[j] = round[1];
+ quant[j] = quant[1];
+ quant_shift[j] = quant_shift[1];
+ dequant[j] = dequant[1];
+ }
+}
+
class VP9QuantizeBase : public AbstractBench {
public:
VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp)
@@ -148,6 +187,7 @@ class VP9QuantizeTest : public VP9QuantizeBase,
protected:
virtual void Run();
+ void Speed(bool is_median);
const QuantizeFunc quantize_op_;
const QuantizeFunc ref_quantize_op_;
};
@@ -159,6 +199,101 @@ void VP9QuantizeTest::Run() {
scan_->iscan);
}
+void VP9QuantizeTest::Speed(bool is_median) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ ASSERT_TRUE(coeff_.Init());
+ ASSERT_TRUE(qcoeff_.Init());
+ ASSERT_TRUE(dqcoeff_.Init());
+ TX_SIZE starting_sz, ending_sz;
+
+ if (max_size_ == 16) {
+ starting_sz = TX_4X4;
+ ending_sz = TX_16X16;
+ } else {
+ starting_sz = TX_32X32;
+ ending_sz = TX_32X32;
+ }
+
+ for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) {
+ // zbin > coeff, zbin < coeff.
+ for (int i = 0; i < 2; ++i) {
+ // TX_TYPE defines the scan order. That is not relevant to the speed test.
+ // Pick the first one.
+ const TX_TYPE tx_type = DCT_DCT;
+ count_ = (4 << sz) * (4 << sz);
+ scan_ = &vp9_scan_orders[sz][tx_type];
+
+ GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
+ quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+ quant_fp_ptr_);
+
+ if (i == 0) {
+ // When |coeff values| are less than zbin the results are 0.
+ int threshold = 100;
+ if (max_size_ == 32) {
+ // For 32x32, the threshold is halved. Double it to keep the values
+ // from clearing it.
+ threshold = 200;
+ }
+ for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold;
+ coeff_.Set(&rnd, -99, 99);
+ } else if (i == 1) {
+ for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50;
+ coeff_.Set(&rnd, -500, 500);
+ }
+
+ const char *type =
+ (i == 0) ? "Bypass calculations " : "Full calculations ";
+ char block_size[16];
+ snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz);
+ char title[100];
+ snprintf(title, sizeof(title), "%25s %8s ", type, block_size);
+
+ if (is_median) {
+ RunNTimes(10000000 / count_);
+ PrintMedian(title);
+ } else {
+ Buffer<tran_low_t> ref_qcoeff =
+ Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+ ASSERT_TRUE(ref_qcoeff.Init());
+ Buffer<tran_low_t> ref_dqcoeff =
+ Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+ ASSERT_TRUE(ref_dqcoeff.Init());
+ uint16_t ref_eob = 0;
+
+ const int kNumTests = 5000000;
+ vpx_usec_timer timer, simd_timer;
+
+ vpx_usec_timer_start(&timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_,
+ q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
+ ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
+ scan_->scan, scan_->iscan);
+ }
+ vpx_usec_timer_mark(&timer);
+
+ vpx_usec_timer_start(&simd_timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
+ quant_shift_ptr_, qcoeff_.TopLeftPixel(),
+ dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_,
+ scan_->scan, scan_->iscan);
+ }
+ vpx_usec_timer_mark(&simd_timer);
+
+ const int elapsed_time =
+ static_cast<int>(vpx_usec_timer_elapsed(&timer));
+ const int simd_elapsed_time =
+ static_cast<int>(vpx_usec_timer_elapsed(&simd_timer));
+ printf("%s c_time = %d \t simd_time = %d \t Gain = %f \n", title,
+ elapsed_time, simd_elapsed_time,
+ ((float)elapsed_time / simd_elapsed_time));
+ }
+ }
+ }
+}
+
// This quantizer compares the AC coefficients to the quantization step size to
// determine if further multiplication operations are needed.
// Based on vp9_quantize_fp_sse2().
@@ -254,45 +389,6 @@ void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1);
}
-void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
- int16_t *quant, int16_t *quant_shift,
- int16_t *dequant, int16_t *round_fp,
- int16_t *quant_fp) {
- // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V.
- const int max_qrounding_factor_fp = 64;
-
- for (int j = 0; j < 2; j++) {
- // The range is 4 to 1828 in the VP9 tables.
- const int qlookup = rnd->RandRange(1825) + 4;
- round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7;
- quant_fp[j] = (1 << 16) / qlookup;
-
- // Values determined by deconstructing vp9_init_quantizer().
- // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y
- // values or U/V values of any bit depth. This is because y_delta is not
- // factored into the vp9_ac_quant() call.
- zbin[j] = rnd->RandRange(1200);
-
- // round may be up to 685 for Y values or 914 for U/V.
- round[j] = rnd->RandRange(914);
- // quant ranges from 1 to -32703
- quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703;
- // quant_shift goes up to 1 << 16.
- quant_shift[j] = rnd->RandRange(16384);
- // dequant maxes out at 1828 for all cases.
- dequant[j] = rnd->RandRange(1828);
- }
- for (int j = 2; j < 8; j++) {
- zbin[j] = zbin[1];
- round_fp[j] = round_fp[1];
- quant_fp[j] = quant_fp[1];
- round[j] = round[1];
- quant[j] = quant[1];
- quant_shift[j] = quant_shift[1];
- dequant[j] = dequant[1];
- }
-}
-
TEST_P(VP9QuantizeTest, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
ASSERT_TRUE(coeff_.Init());
@@ -403,60 +499,9 @@ TEST_P(VP9QuantizeTest, EOBCheck) {
}
}
-TEST_P(VP9QuantizeTest, DISABLED_Speed) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- ASSERT_TRUE(coeff_.Init());
- ASSERT_TRUE(qcoeff_.Init());
- ASSERT_TRUE(dqcoeff_.Init());
- TX_SIZE starting_sz, ending_sz;
-
- if (max_size_ == 16) {
- starting_sz = TX_4X4;
- ending_sz = TX_16X16;
- } else {
- starting_sz = TX_32X32;
- ending_sz = TX_32X32;
- }
-
- for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) {
- // zbin > coeff, zbin < coeff.
- for (int i = 0; i < 2; ++i) {
- // TX_TYPE defines the scan order. That is not relevant to the speed test.
- // Pick the first one.
- const TX_TYPE tx_type = DCT_DCT;
- count_ = (4 << sz) * (4 << sz);
- scan_ = &vp9_scan_orders[sz][tx_type];
-
- GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
- quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
- quant_fp_ptr_);
-
- if (i == 0) {
- // When |coeff values| are less than zbin the results are 0.
- int threshold = 100;
- if (max_size_ == 32) {
- // For 32x32, the threshold is halved. Double it to keep the values
- // from clearing it.
- threshold = 200;
- }
- for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold;
- coeff_.Set(&rnd, -99, 99);
- } else if (i == 1) {
- for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50;
- coeff_.Set(&rnd, -500, 500);
- }
+TEST_P(VP9QuantizeTest, DISABLED_Speed) { Speed(false); }
- RunNTimes(10000000 / count_);
- const char *type =
- (i == 0) ? "Bypass calculations " : "Full calculations ";
- char block_size[16];
- snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz);
- char title[100];
- snprintf(title, sizeof(title), "%25s %8s ", type, block_size);
- PrintMedian(title);
- }
- }
-}
+TEST_P(VP9QuantizeTest, DISABLED_SpeedMedian) { Speed(true); }
using std::make_tuple;
@@ -467,6 +512,8 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(
make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16,
false),
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
+ &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
VPX_BITS_8, 16, false),
make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
@@ -492,7 +539,6 @@ INSTANTIATE_TEST_SUITE_P(
#endif // HAVE_SSE2
#if HAVE_SSSE3
-#if VPX_ARCH_X86_64
INSTANTIATE_TEST_SUITE_P(
SSSE3, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
@@ -506,16 +552,6 @@ INSTANTIATE_TEST_SUITE_P(
make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
&QuantFPWrapper<quantize_fp_32x32_nz_c>,
VPX_BITS_8, 32, true)));
-#else
-INSTANTIATE_TEST_SUITE_P(
- SSSE3, VP9QuantizeTest,
- ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
- VPX_BITS_8, 16, false),
- make_tuple(&vpx_quantize_b_32x32_ssse3,
- &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
- false)));
-
-#endif // VPX_ARCH_X86_64
#endif // HAVE_SSSE3
#if HAVE_AVX
@@ -529,14 +565,78 @@ INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest,
#endif // HAVE_AVX
#if VPX_ARCH_X86_64 && HAVE_AVX2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, VP9QuantizeTest,
+ ::testing::Values(
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
+ &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
+ make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_avx2>,
+ &QuantFPWrapper<vp9_highbd_quantize_fp_c>, VPX_BITS_12, 16,
+ true),
+ make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_avx2>,
+ &QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12,
+ 32, true),
+ make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16,
+ false),
+ make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
+ VPX_BITS_8, 16, false),
+ make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
+ VPX_BITS_10, 16, false),
+ make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
+ VPX_BITS_12, 16, false),
+ make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c,
+ VPX_BITS_8, 32, false),
+ make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
+ &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
+ make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
+ &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
+ make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
+ &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));
+#else
INSTANTIATE_TEST_SUITE_P(
AVX2, VP9QuantizeTest,
::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
&QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
- 16, true)));
+ 16, true),
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_avx2>,
+ &QuantFPWrapper<quantize_fp_32x32_nz_c>,
+ VPX_BITS_8, 32, true),
+ make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c,
+ VPX_BITS_8, 16, false),
+ make_tuple(&vpx_quantize_b_32x32_avx2,
+ &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
+ false)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_AVX2
#if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+ NEON, VP9QuantizeTest,
+ ::testing::Values(
+ make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16,
+ false),
+ make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
+ VPX_BITS_8, 16, false),
+ make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
+ VPX_BITS_10, 16, false),
+ make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
+ VPX_BITS_12, 16, false),
+ make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c,
+ VPX_BITS_8, 32, false),
+ make_tuple(&vpx_highbd_quantize_b_32x32_neon,
+ &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
+ make_tuple(&vpx_highbd_quantize_b_32x32_neon,
+ &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
+ make_tuple(&vpx_highbd_quantize_b_32x32_neon,
+ &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false),
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
+ &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
+ &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32,
+ true)));
+#else
INSTANTIATE_TEST_SUITE_P(
NEON, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c,
@@ -550,6 +650,7 @@ INSTANTIATE_TEST_SUITE_P(
make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
&QuantFPWrapper<vp9_quantize_fp_32x32_c>,
VPX_BITS_8, 32, true)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_NEON
#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH
diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc
index b09a45bb7..1d1a78f43 100644
--- a/test/vp9_ratectrl_rtc_test.cc
+++ b/test/vp9_ratectrl_rtc_test.cc
@@ -26,7 +26,11 @@ namespace {
const size_t kNumFrames = 300;
-const int kTemporalId[4] = { 0, 2, 1, 2 };
+const int kTemporalId3Layer[4] = { 0, 2, 1, 2 };
+const int kTemporalId2Layer[2] = { 0, 1 };
+const int kTemporalRateAllocation3Layer[3] = { 50, 70, 100 };
+const int kTemporalRateAllocation2Layer[2] = { 60, 100 };
+const int kSpatialLayerBitrate[3] = { 200, 400, 1000 };
class RcInterfaceTest
: public ::libvpx_test::EncoderTest,
@@ -179,28 +183,73 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
encoder->Control(VP9E_SET_SVC, 1);
encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
}
-
- frame_params_.frame_type = video->frame() == 0 ? KEY_FRAME : INTER_FRAME;
- if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) {
- // Disable golden frame update.
- frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
- frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
- }
+ frame_params_.frame_type =
+ video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
encoder_exit_ = video->frame() == kNumFrames;
current_superframe_ = video->frame();
+ if (dynamic_spatial_layers_ == 1) {
+ if (video->frame() == 100) {
+ // Go down to 2 spatial layers: set top SL to 0 bitrate.
+ // Update the encoder config.
+ cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8];
+ cfg_.layer_target_bitrate[6] = 0;
+ cfg_.layer_target_bitrate[7] = 0;
+ cfg_.layer_target_bitrate[8] = 0;
+ encoder->Config(&cfg_);
+ // Update the RC config.
+ rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[8];
+ rc_cfg_.layer_target_bitrate[6] = 0;
+ rc_cfg_.layer_target_bitrate[7] = 0;
+ rc_cfg_.layer_target_bitrate[8] = 0;
+ rc_api_->UpdateRateControl(rc_cfg_);
+ } else if (video->frame() == 200) {
+ // Go down to 1 spatial layer.
+ // Update the encoder config.
+ cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[5];
+ cfg_.layer_target_bitrate[3] = 0;
+ cfg_.layer_target_bitrate[4] = 0;
+ cfg_.layer_target_bitrate[5] = 0;
+ encoder->Config(&cfg_);
+ // Update the RC config.
+ rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[5];
+ rc_cfg_.layer_target_bitrate[3] = 0;
+ rc_cfg_.layer_target_bitrate[4] = 0;
+ rc_cfg_.layer_target_bitrate[5] = 0;
+ rc_api_->UpdateRateControl(rc_cfg_);
+ } else if (0 && video->frame() == 280) {
+ // TODO(marpan): Re-enable this going back up when issue is fixed.
+ // Go back up to 3 spatial layers.
+ // Update the encoder config: use the original bitrates.
+ SetEncoderConfigSvc(3, 3);
+ encoder->Config(&cfg_);
+ // Update the RC config.
+ SetRCConfigSvc(3, 3);
+ rc_api_->UpdateRateControl(rc_cfg_);
+ }
+ }
}
virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
::libvpx_test::CxDataIterator iter = encoder->GetCxData();
+ for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0;
while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
ParseSuperframeSizes(static_cast<const uint8_t *>(pkt->data.frame.buf),
pkt->data.frame.sz);
for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) {
- frame_params_.spatial_layer_id = sl;
- frame_params_.temporal_layer_id = kTemporalId[current_superframe_ % 4];
- rc_api_->ComputeQP(frame_params_);
- frame_params_.frame_type = INTER_FRAME;
- rc_api_->PostEncodeUpdate(sizes_[sl]);
+ if (sizes_[sl] > 0) {
+ frame_params_.spatial_layer_id = sl;
+ if (rc_cfg_.ts_number_layers == 3)
+ frame_params_.temporal_layer_id =
+ kTemporalId3Layer[current_superframe_ % 4];
+ else if (rc_cfg_.ts_number_layers == 2)
+ frame_params_.temporal_layer_id =
+ kTemporalId2Layer[current_superframe_ % 2];
+ else
+ frame_params_.temporal_layer_id = 0;
+ rc_api_->ComputeQP(frame_params_);
+ frame_params_.frame_type = INTER_FRAME;
+ rc_api_->PostEncodeUpdate(sizes_[sl]);
+ }
}
}
if (!encoder_exit_) {
@@ -218,9 +267,37 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
const vpx_image_t * /*img2*/) {}
void RunSvc() {
- SetConfigSvc();
+ dynamic_spatial_layers_ = 0;
+ SetRCConfigSvc(3, 3);
+ key_interval_ = 10000;
+ rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+ SetEncoderConfigSvc(3, 3);
+
+ ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+ 1280, 720, 30, 1, 0, kNumFrames);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ void RunSvcPeriodicKey() {
+ dynamic_spatial_layers_ = 0;
+ SetRCConfigSvc(3, 3);
+ key_interval_ = 100;
+ rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+ SetEncoderConfigSvc(3, 3);
+
+ ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+ 1280, 720, 30, 1, 0, kNumFrames);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ void RunSvcDynamicSpatial() {
+ dynamic_spatial_layers_ = 1;
+ SetRCConfigSvc(3, 3);
+ key_interval_ = 10000;
rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
- SetEncoderSvc();
+ SetEncoderConfigSvc(3, 3);
::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
1280, 720, 30, 1, 0, kNumFrames);
@@ -256,30 +333,54 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
return VPX_CODEC_OK;
}
- void SetEncoderSvc() {
- cfg_.ss_number_layers = 3;
- cfg_.ts_number_layers = 3;
+ void SetEncoderConfigSvc(int number_spatial_layers,
+ int number_temporal_layers) {
+ cfg_.g_w = 1280;
+ cfg_.g_h = 720;
+ cfg_.ss_number_layers = number_spatial_layers;
+ cfg_.ts_number_layers = number_temporal_layers;
cfg_.g_timebase.num = 1;
cfg_.g_timebase.den = 30;
- svc_params_.scaling_factor_num[0] = 72;
- svc_params_.scaling_factor_den[0] = 288;
- svc_params_.scaling_factor_num[1] = 144;
- svc_params_.scaling_factor_den[1] = 288;
- svc_params_.scaling_factor_num[2] = 288;
- svc_params_.scaling_factor_den[2] = 288;
+ if (number_spatial_layers == 3) {
+ svc_params_.scaling_factor_num[0] = 1;
+ svc_params_.scaling_factor_den[0] = 4;
+ svc_params_.scaling_factor_num[1] = 2;
+ svc_params_.scaling_factor_den[1] = 4;
+ svc_params_.scaling_factor_num[2] = 4;
+ svc_params_.scaling_factor_den[2] = 4;
+ } else if (number_spatial_layers == 2) {
+ svc_params_.scaling_factor_num[0] = 1;
+ svc_params_.scaling_factor_den[0] = 2;
+ svc_params_.scaling_factor_num[1] = 2;
+ svc_params_.scaling_factor_den[1] = 2;
+ } else if (number_spatial_layers == 1) {
+ svc_params_.scaling_factor_num[0] = 1;
+ svc_params_.scaling_factor_den[0] = 1;
+ }
+
for (int i = 0; i < VPX_MAX_LAYERS; ++i) {
svc_params_.max_quantizers[i] = 56;
svc_params_.min_quantizers[i] = 2;
svc_params_.speed_per_layer[i] = 7;
+ svc_params_.loopfilter_ctrl[i] = LOOPFILTER_ALL;
}
cfg_.rc_end_usage = VPX_CBR;
cfg_.g_lag_in_frames = 0;
cfg_.g_error_resilient = 0;
- // 3 temporal layers
- cfg_.ts_rate_decimator[0] = 4;
- cfg_.ts_rate_decimator[1] = 2;
- cfg_.ts_rate_decimator[2] = 1;
- cfg_.temporal_layering_mode = 3;
+
+ if (number_temporal_layers == 3) {
+ cfg_.ts_rate_decimator[0] = 4;
+ cfg_.ts_rate_decimator[1] = 2;
+ cfg_.ts_rate_decimator[2] = 1;
+ cfg_.temporal_layering_mode = 3;
+ } else if (number_temporal_layers == 2) {
+ cfg_.ts_rate_decimator[0] = 2;
+ cfg_.ts_rate_decimator[1] = 1;
+ cfg_.temporal_layering_mode = 2;
+ } else if (number_temporal_layers == 1) {
+ cfg_.ts_rate_decimator[0] = 1;
+ cfg_.temporal_layering_mode = 0;
+ }
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 600;
@@ -288,27 +389,39 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
cfg_.rc_max_quantizer = 56;
cfg_.g_threads = 1;
cfg_.kf_max_dist = 9999;
- cfg_.rc_target_bitrate = 1600;
cfg_.rc_overshoot_pct = 50;
cfg_.rc_undershoot_pct = 50;
- cfg_.layer_target_bitrate[0] = 100;
- cfg_.layer_target_bitrate[1] = 140;
- cfg_.layer_target_bitrate[2] = 200;
- cfg_.layer_target_bitrate[3] = 250;
- cfg_.layer_target_bitrate[4] = 350;
- cfg_.layer_target_bitrate[5] = 500;
- cfg_.layer_target_bitrate[6] = 450;
- cfg_.layer_target_bitrate[7] = 630;
- cfg_.layer_target_bitrate[8] = 900;
+ cfg_.rc_target_bitrate = 0;
+ for (int sl = 0; sl < number_spatial_layers; sl++) {
+ int spatial_bitrate = 0;
+ if (number_spatial_layers <= 3)
+ spatial_bitrate = kSpatialLayerBitrate[sl];
+ for (int tl = 0; tl < number_temporal_layers; tl++) {
+ int layer = sl * number_temporal_layers + tl;
+ if (number_temporal_layers == 3)
+ cfg_.layer_target_bitrate[layer] =
+ kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100;
+ else if (number_temporal_layers == 2)
+ cfg_.layer_target_bitrate[layer] =
+ kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100;
+ else if (number_temporal_layers == 1)
+ cfg_.layer_target_bitrate[layer] = spatial_bitrate;
+ }
+ cfg_.rc_target_bitrate += spatial_bitrate;
+ }
+
+ cfg_.kf_min_dist = key_interval_;
+ cfg_.kf_max_dist = key_interval_;
}
- void SetConfigSvc() {
+ void SetRCConfigSvc(int number_spatial_layers, int number_temporal_layers) {
rc_cfg_.width = 1280;
rc_cfg_.height = 720;
+ rc_cfg_.ss_number_layers = number_spatial_layers;
+ rc_cfg_.ts_number_layers = number_temporal_layers;
rc_cfg_.max_quantizer = 56;
rc_cfg_.min_quantizer = 2;
- rc_cfg_.target_bandwidth = 1600;
rc_cfg_.buf_initial_sz = 500;
rc_cfg_.buf_optimal_sz = 600;
rc_cfg_.buf_sz = 1000;
@@ -316,31 +429,55 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
rc_cfg_.overshoot_pct = 50;
rc_cfg_.max_intra_bitrate_pct = 900;
rc_cfg_.framerate = 30.0;
- rc_cfg_.ss_number_layers = 3;
- rc_cfg_.ts_number_layers = 3;
rc_cfg_.rc_mode = VPX_CBR;
rc_cfg_.aq_mode = aq_mode_;
- rc_cfg_.scaling_factor_num[0] = 1;
- rc_cfg_.scaling_factor_den[0] = 4;
- rc_cfg_.scaling_factor_num[1] = 2;
- rc_cfg_.scaling_factor_den[1] = 4;
- rc_cfg_.scaling_factor_num[2] = 4;
- rc_cfg_.scaling_factor_den[2] = 4;
-
- rc_cfg_.ts_rate_decimator[0] = 4;
- rc_cfg_.ts_rate_decimator[1] = 2;
- rc_cfg_.ts_rate_decimator[2] = 1;
-
- rc_cfg_.layer_target_bitrate[0] = 100;
- rc_cfg_.layer_target_bitrate[1] = 140;
- rc_cfg_.layer_target_bitrate[2] = 200;
- rc_cfg_.layer_target_bitrate[3] = 250;
- rc_cfg_.layer_target_bitrate[4] = 350;
- rc_cfg_.layer_target_bitrate[5] = 500;
- rc_cfg_.layer_target_bitrate[6] = 450;
- rc_cfg_.layer_target_bitrate[7] = 630;
- rc_cfg_.layer_target_bitrate[8] = 900;
+ if (number_spatial_layers == 3) {
+ rc_cfg_.scaling_factor_num[0] = 1;
+ rc_cfg_.scaling_factor_den[0] = 4;
+ rc_cfg_.scaling_factor_num[1] = 2;
+ rc_cfg_.scaling_factor_den[1] = 4;
+ rc_cfg_.scaling_factor_num[2] = 4;
+ rc_cfg_.scaling_factor_den[2] = 4;
+ } else if (number_spatial_layers == 2) {
+ rc_cfg_.scaling_factor_num[0] = 1;
+ rc_cfg_.scaling_factor_den[0] = 2;
+ rc_cfg_.scaling_factor_num[1] = 2;
+ rc_cfg_.scaling_factor_den[1] = 2;
+ } else if (number_spatial_layers == 1) {
+ rc_cfg_.scaling_factor_num[0] = 1;
+ rc_cfg_.scaling_factor_den[0] = 1;
+ }
+
+ if (number_temporal_layers == 3) {
+ rc_cfg_.ts_rate_decimator[0] = 4;
+ rc_cfg_.ts_rate_decimator[1] = 2;
+ rc_cfg_.ts_rate_decimator[2] = 1;
+ } else if (number_temporal_layers == 2) {
+ rc_cfg_.ts_rate_decimator[0] = 2;
+ rc_cfg_.ts_rate_decimator[1] = 1;
+ } else if (number_temporal_layers == 1) {
+ rc_cfg_.ts_rate_decimator[0] = 1;
+ }
+
+ rc_cfg_.target_bandwidth = 0;
+ for (int sl = 0; sl < number_spatial_layers; sl++) {
+ int spatial_bitrate = 0;
+ if (number_spatial_layers <= 3)
+ spatial_bitrate = kSpatialLayerBitrate[sl];
+ for (int tl = 0; tl < number_temporal_layers; tl++) {
+ int layer = sl * number_temporal_layers + tl;
+ if (number_temporal_layers == 3)
+ rc_cfg_.layer_target_bitrate[layer] =
+ kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100;
+ else if (number_temporal_layers == 2)
+ rc_cfg_.layer_target_bitrate[layer] =
+ kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100;
+ else if (number_temporal_layers == 1)
+ rc_cfg_.layer_target_bitrate[layer] = spatial_bitrate;
+ }
+ rc_cfg_.target_bandwidth += spatial_bitrate;
+ }
for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) {
for (int tl = 0; tl < rc_cfg_.ts_number_layers; ++tl) {
@@ -359,6 +496,8 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
bool encoder_exit_;
int current_superframe_;
uint32_t sizes_[8];
+ int key_interval_;
+ int dynamic_spatial_layers_;
};
TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); }
@@ -367,6 +506,10 @@ TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); }
TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); }
+TEST_P(RcInterfaceSvcTest, SvcPeriodicKey) { RunSvcPeriodicKey(); }
+
+TEST_P(RcInterfaceSvcTest, SvcDynamicSpatial) { RunSvcDynamicSpatial(); }
+
VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3),
::testing::Values(VPX_CBR, VPX_VBR));
VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3));
diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc
index 211cc6c7a..a57082f1e 100644
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -7,6 +7,7 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <tuple>
#include "third_party/googletest/src/include/gtest/gtest.h"
@@ -17,9 +18,11 @@
#include "test/bench.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
+#include "test/util.h"
#include "vp9/common/vp9_blockd.h"
#include "vpx_ports/msvc.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_timer.h"
typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr,
ptrdiff_t diff_stride, const uint8_t *src_ptr,
@@ -133,6 +136,10 @@ INSTANTIATE_TEST_SUITE_P(C, VP9SubtractBlockTest,
INSTANTIATE_TEST_SUITE_P(SSE2, VP9SubtractBlockTest,
::testing::Values(vpx_subtract_block_sse2));
#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, VP9SubtractBlockTest,
+ ::testing::Values(vpx_subtract_block_avx2));
+#endif
#if HAVE_NEON
INSTANTIATE_TEST_SUITE_P(NEON, VP9SubtractBlockTest,
::testing::Values(vpx_subtract_block_neon));
@@ -157,4 +164,158 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9SubtractBlockTest,
::testing::Values(vpx_subtract_block_lsx));
#endif
+#if CONFIG_VP9_HIGHBITDEPTH
+
+typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride, int bd);
+
+// <BLOCK_SIZE, bit_depth, optimized subtract func, reference subtract func>
+using Params = std::tuple<BLOCK_SIZE, int, HBDSubtractFunc, HBDSubtractFunc>;
+
+class VPXHBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
+ public:
+ virtual void SetUp() {
+ block_width_ = 4 * num_4x4_blocks_wide_lookup[GET_PARAM(0)];
+ block_height_ = 4 * num_4x4_blocks_high_lookup[GET_PARAM(0)];
+ bit_depth_ = static_cast<vpx_bit_depth_t>(GET_PARAM(1));
+ func_ = GET_PARAM(2);
+ ref_func_ = GET_PARAM(3);
+
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+
+ constexpr size_t kMaxWidth = 128;
+ constexpr size_t kMaxBlockSize = kMaxWidth * kMaxWidth;
+ src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t))));
+ ASSERT_NE(src_, nullptr);
+ pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t))));
+ ASSERT_NE(pred_, nullptr);
+ diff_ = reinterpret_cast<int16_t *>(
+ vpx_memalign(16, kMaxBlockSize * sizeof(int16_t)));
+ ASSERT_NE(diff_, nullptr);
+ }
+
+ virtual void TearDown() {
+ vpx_free(CONVERT_TO_SHORTPTR(src_));
+ vpx_free(CONVERT_TO_SHORTPTR(pred_));
+ vpx_free(diff_);
+ }
+
+ protected:
+ void CheckResult();
+ void RunForSpeed();
+
+ private:
+ ACMRandom rnd_;
+ int block_height_;
+ int block_width_;
+ vpx_bit_depth_t bit_depth_;
+ HBDSubtractFunc func_;
+ HBDSubtractFunc ref_func_;
+ uint8_t *src_;
+ uint8_t *pred_;
+ int16_t *diff_;
+};
+
+void VPXHBDSubtractBlockTest::CheckResult() {
+ constexpr int kTestNum = 100;
+ constexpr int kMaxWidth = 128;
+ constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth;
+ const int mask = (1 << bit_depth_) - 1;
+ for (int i = 0; i < kTestNum; ++i) {
+ for (int j = 0; j < kMaxBlockSize; ++j) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+ CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+ }
+
+ func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+ pred_, block_width_, bit_depth_);
+
+ for (int r = 0; r < block_height_; ++r) {
+ for (int c = 0; c < block_width_; ++c) {
+ EXPECT_EQ(diff_[r * block_width_ + c],
+ (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] -
+ CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c]))
+ << "r = " << r << ", c = " << c << ", test: " << i;
+ }
+ }
+ }
+}
+
+TEST_P(VPXHBDSubtractBlockTest, CheckResult) { CheckResult(); }
+
+void VPXHBDSubtractBlockTest::RunForSpeed() {
+ constexpr int kTestNum = 200000;
+ constexpr int kMaxWidth = 128;
+ constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth;
+ const int mask = (1 << bit_depth_) - 1;
+
+ if (ref_func_ == func_) GTEST_SKIP();
+
+ for (int j = 0; j < kMaxBlockSize; ++j) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+ CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+ }
+
+ vpx_usec_timer ref_timer;
+ vpx_usec_timer_start(&ref_timer);
+ for (int i = 0; i < kTestNum; ++i) {
+ ref_func_(block_height_, block_width_, diff_, block_width_, src_,
+ block_width_, pred_, block_width_, bit_depth_);
+ }
+ vpx_usec_timer_mark(&ref_timer);
+ const int64_t ref_elapsed_time = vpx_usec_timer_elapsed(&ref_timer);
+
+ for (int j = 0; j < kMaxBlockSize; ++j) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+ CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+ }
+
+ vpx_usec_timer timer;
+ vpx_usec_timer_start(&timer);
+ for (int i = 0; i < kTestNum; ++i) {
+ func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+ pred_, block_width_, bit_depth_);
+ }
+ vpx_usec_timer_mark(&timer);
+ const int64_t elapsed_time = vpx_usec_timer_elapsed(&timer);
+
+ printf(
+ "[%dx%d]: "
+ "ref_time=%6" PRId64 " \t simd_time=%6" PRId64
+ " \t "
+ "gain=%f \n",
+ block_width_, block_height_, ref_elapsed_time, elapsed_time,
+ static_cast<double>(ref_elapsed_time) /
+ static_cast<double>(elapsed_time));
+}
+
+TEST_P(VPXHBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); }
+
+const BLOCK_SIZE kValidBlockSize[] = { BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
+ BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
+ BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
+ BLOCK_64X64 };
+
+INSTANTIATE_TEST_SUITE_P(
+ C, VPXHBDSubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(12),
+ ::testing::Values(&vpx_highbd_subtract_block_c),
+ ::testing::Values(&vpx_highbd_subtract_block_c)));
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, VPXHBDSubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(12),
+ ::testing::Values(&vpx_highbd_subtract_block_avx2),
+ ::testing::Values(&vpx_highbd_subtract_block_c)));
+#endif // HAVE_AVX2
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace vp9
diff --git a/third_party/googletest/README.libvpx b/third_party/googletest/README.libvpx
index b9a74922f..5f6b01b0e 100644
--- a/third_party/googletest/README.libvpx
+++ b/third_party/googletest/README.libvpx
@@ -1,5 +1,5 @@
URL: https://github.com/google/googletest.git
-Version: release-1.11.0
+Version: release-1.12.1
License: BSD
License File: LICENSE
@@ -13,9 +13,17 @@ generation.
Local Modifications:
- Remove everything but:
+ .clang-format
CONTRIBUTORS
googletest/
include
README.md
src
LICENSE
+- Move .clang-format, CONTRIBUTORS, and LICENSE into googletest/
+- In googletest/include/gtest/internal/custom/gtest-port.h, define
+ GTEST_HAS_NOTIFICATION_ as 1 and use a stub Notification class to fix
+ the mingw32 g++ compilation errors caused by the lack of std::mutex
+ and std::condition_variable in the <mutex> and <condition_variable>
+ headers if mingw32 is configured with the win32 threads option. See
+ https://stackoverflow.com/questions/17242516/mingw-w64-threads-posix-vs-win32
diff --git a/third_party/googletest/src/.clang-format b/third_party/googletest/src/.clang-format
new file mode 100644
index 000000000..5b9bfe6d2
--- /dev/null
+++ b/third_party/googletest/src/.clang-format
@@ -0,0 +1,4 @@
+# Run manually to reformat a file:
+# clang-format -i --style=file <file>
+Language: Cpp
+BasedOnStyle: Google
diff --git a/third_party/googletest/src/CONTRIBUTORS b/third_party/googletest/src/CONTRIBUTORS
index 76db0b40f..77397a5b5 100644
--- a/third_party/googletest/src/CONTRIBUTORS
+++ b/third_party/googletest/src/CONTRIBUTORS
@@ -34,6 +34,7 @@ Manuel Klimek <klimek@google.com>
Mario Tanev <radix@google.com>
Mark Paskin
Markus Heule <markus.heule@gmail.com>
+Martijn Vels <mvels@google.com>
Matthew Simmons <simmonmt@acm.org>
Mika Raento <mikie@iki.fi>
Mike Bland <mbland@google.com>
@@ -55,6 +56,7 @@ Russ Rufer <russ@pentad.com>
Sean Mcafee <eefacm@gmail.com>
Sigurður Ásgeirsson <siggi@google.com>
Sverre Sundsdal <sundsdal@gmail.com>
+Szymon Sobik <sobik.szymon@gmail.com>
Takeshi Yoshino <tyoshino@google.com>
Tracy Bialik <tracy@pentad.com>
Vadim Berman <vadimb@google.com>
diff --git a/third_party/googletest/src/README.md b/third_party/googletest/src/README.md
index 1f8b349ae..d26b309ed 100644
--- a/third_party/googletest/src/README.md
+++ b/third_party/googletest/src/README.md
@@ -25,7 +25,7 @@ When building GoogleTest as a standalone project, the typical workflow starts
with
```
-git clone https://github.com/google/googletest.git -b release-1.10.0
+git clone https://github.com/google/googletest.git -b release-1.11.0
cd googletest # Main directory of the cloned repository.
mkdir build # Create a directory to hold the build output.
cd build
@@ -94,7 +94,7 @@ include(FetchContent)
FetchContent_Declare(
googletest
# Specify the commit you depend on and update it regularly.
- URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip
+ URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
@@ -203,7 +203,9 @@ add
-DGTEST_DONT_DEFINE_FOO=1
to the compiler flags to tell GoogleTest to change the macro's name from `FOO`
-to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For
+to `GTEST_FOO`. Currently `FOO` can be `ASSERT_EQ`, `ASSERT_FALSE`, `ASSERT_GE`,
+`ASSERT_GT`, `ASSERT_LE`, `ASSERT_LT`, `ASSERT_NE`, `ASSERT_TRUE`,
+`EXPECT_FALSE`, `EXPECT_TRUE`, `FAIL`, `SUCCEED`, `TEST`, or `TEST_F`. For
example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write
GTEST_TEST(SomeTest, DoesThis) { ... }
diff --git a/third_party/googletest/src/include/gtest/gtest-assertion-result.h b/third_party/googletest/src/include/gtest/gtest-assertion-result.h
new file mode 100644
index 000000000..addbb59c6
--- /dev/null
+++ b/third_party/googletest/src/include/gtest/gtest-assertion-result.h
@@ -0,0 +1,237 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements the AssertionResult type.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-port.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// A class for indicating whether an assertion was successful. When
+// the assertion wasn't successful, the AssertionResult object
+// remembers a non-empty message that describes how it failed.
+//
+// To create an instance of this class, use one of the factory functions
+// (AssertionSuccess() and AssertionFailure()).
+//
+// This class is useful for two purposes:
+// 1. Defining predicate functions to be used with Boolean test assertions
+// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
+// 2. Defining predicate-format functions to be
+// used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
+//
+// For example, if you define IsEven predicate:
+//
+// testing::AssertionResult IsEven(int n) {
+// if ((n % 2) == 0)
+// return testing::AssertionSuccess();
+// else
+// return testing::AssertionFailure() << n << " is odd";
+// }
+//
+// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
+// will print the message
+//
+// Value of: IsEven(Fib(5))
+// Actual: false (5 is odd)
+// Expected: true
+//
+// instead of a more opaque
+//
+// Value of: IsEven(Fib(5))
+// Actual: false
+// Expected: true
+//
+// in case IsEven is a simple Boolean predicate.
+//
+// If you expect your predicate to be reused and want to support informative
+// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
+// about half as often as positive ones in our tests), supply messages for
+// both success and failure cases:
+//
+// testing::AssertionResult IsEven(int n) {
+// if ((n % 2) == 0)
+// return testing::AssertionSuccess() << n << " is even";
+// else
+// return testing::AssertionFailure() << n << " is odd";
+// }
+//
+// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
+//
+// Value of: IsEven(Fib(6))
+// Actual: true (8 is even)
+// Expected: false
+//
+// NB: Predicates that support negative Boolean assertions have reduced
+// performance in positive ones so be careful not to use them in tests
+// that have lots (tens of thousands) of positive Boolean assertions.
+//
+// To use this class with EXPECT_PRED_FORMAT assertions such as:
+//
+// // Verifies that Foo() returns an even number.
+// EXPECT_PRED_FORMAT1(IsEven, Foo());
+//
+// you need to define:
+//
+// testing::AssertionResult IsEven(const char* expr, int n) {
+// if ((n % 2) == 0)
+// return testing::AssertionSuccess();
+// else
+// return testing::AssertionFailure()
+// << "Expected: " << expr << " is even\n Actual: it's " << n;
+// }
+//
+// If Foo() returns 5, you will see the following message:
+//
+// Expected: Foo() is even
+// Actual: it's 5
+//
+class GTEST_API_ AssertionResult {
+ public:
+ // Copy constructor.
+ // Used in EXPECT_TRUE/FALSE(assertion_result).
+ AssertionResult(const AssertionResult& other);
+
+// C4800 is a level 3 warning in Visual Studio 2015 and earlier.
+// This warning is not emitted in Visual Studio 2017.
+// This warning is off by default starting in Visual Studio 2019 but can be
+// enabled with command-line options.
+#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */)
+#endif
+
+ // Used in the EXPECT_TRUE/FALSE(bool_expression).
+ //
+ // T must be contextually convertible to bool.
+ //
+ // The second parameter prevents this overload from being considered if
+ // the argument is implicitly convertible to AssertionResult. In that case
+ // we want AssertionResult's copy constructor to be used.
+ template <typename T>
+ explicit AssertionResult(
+ const T& success,
+ typename std::enable_if<
+ !std::is_convertible<T, AssertionResult>::value>::type*
+ /*enabler*/
+ = nullptr)
+ : success_(success) {}
+
+#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
+ GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
+
+ // Assignment operator.
+ AssertionResult& operator=(AssertionResult other) {
+ swap(other);
+ return *this;
+ }
+
+ // Returns true if and only if the assertion succeeded.
+ operator bool() const { return success_; } // NOLINT
+
+ // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+ AssertionResult operator!() const;
+
+ // Returns the text streamed into this AssertionResult. Test assertions
+ // use it when they fail (i.e., the predicate's outcome doesn't match the
+ // assertion's expectation). When nothing has been streamed into the
+ // object, returns an empty string.
+ const char* message() const {
+ return message_.get() != nullptr ? message_->c_str() : "";
+ }
+ // Deprecated; please use message() instead.
+ const char* failure_message() const { return message(); }
+
+ // Streams a custom failure message into this object.
+ template <typename T>
+ AssertionResult& operator<<(const T& value) {
+ AppendMessage(Message() << value);
+ return *this;
+ }
+
+ // Allows streaming basic output manipulators such as endl or flush into
+ // this object.
+ AssertionResult& operator<<(
+ ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
+ AppendMessage(Message() << basic_manipulator);
+ return *this;
+ }
+
+ private:
+ // Appends the contents of message to message_.
+ void AppendMessage(const Message& a_message) {
+ if (message_.get() == nullptr) message_.reset(new ::std::string);
+ message_->append(a_message.GetString().c_str());
+ }
+
+ // Swap the contents of this AssertionResult with other.
+ void swap(AssertionResult& other);
+
+ // Stores result of the assertion predicate.
+ bool success_;
+ // Stores the message describing the condition in case the expectation
+ // construct is not satisfied with the predicate's outcome.
+ // Referenced via a pointer to avoid taking too much stack frame space
+ // with test assertions.
+ std::unique_ptr< ::std::string> message_;
+};
+
+// Makes a successful assertion result.
+GTEST_API_ AssertionResult AssertionSuccess();
+
+// Makes a failed assertion result.
+GTEST_API_ AssertionResult AssertionFailure();
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << msg.
+GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
+
+} // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+
+#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
diff --git a/third_party/googletest/src/include/gtest/gtest-death-test.h b/third_party/googletest/src/include/gtest/gtest-death-test.h
index 9b4d4d133..84e5a5bbd 100644
--- a/third_party/googletest/src/include/gtest/gtest-death-test.h
+++ b/third_party/googletest/src/include/gtest/gtest-death-test.h
@@ -27,21 +27,21 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file defines the public API for death tests. It is
// #included by gtest.h so a user doesn't need to include this
// directly.
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
#include "gtest/internal/gtest-death-test-internal.h"
-namespace testing {
-
// This flag controls the style of death tests. Valid values are "threadsafe",
// meaning that the death test child process will re-execute the test binary
// from the start, running only a single death test, or "fast",
@@ -49,6 +49,8 @@ namespace testing {
// after forking.
GTEST_DECLARE_string_(death_test_style);
+namespace testing {
+
#if GTEST_HAS_DEATH_TEST
namespace internal {
@@ -103,7 +105,6 @@ GTEST_API_ bool InDeathTestChild();
//
// On the regular expressions used in death tests:
//
-// GOOGLETEST_CM0005 DO NOT DELETE
// On POSIX-compliant systems (*nix), we use the <regex.h> library,
// which uses the POSIX extended regex syntax.
//
@@ -169,24 +170,24 @@ GTEST_API_ bool InDeathTestChild();
// Asserts that a given `statement` causes the program to exit, with an
// integer exit status that satisfies `predicate`, and emitting error output
// that matches `matcher`.
-# define ASSERT_EXIT(statement, predicate, matcher) \
- GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_)
+#define ASSERT_EXIT(statement, predicate, matcher) \
+ GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_)
// Like `ASSERT_EXIT`, but continues on to successive tests in the
// test suite, if any:
-# define EXPECT_EXIT(statement, predicate, matcher) \
- GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_EXIT(statement, predicate, matcher) \
+ GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_)
// Asserts that a given `statement` causes the program to exit, either by
// explicitly exiting with a nonzero exit code or being killed by a
// signal, and emitting error output that matches `matcher`.
-# define ASSERT_DEATH(statement, matcher) \
- ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
+#define ASSERT_DEATH(statement, matcher) \
+ ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
// Like `ASSERT_DEATH`, but continues on to successive tests in the
// test suite, if any:
-# define EXPECT_DEATH(statement, matcher) \
- EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
+#define EXPECT_DEATH(statement, matcher) \
+ EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
@@ -197,22 +198,23 @@ class GTEST_API_ ExitedWithCode {
ExitedWithCode(const ExitedWithCode&) = default;
void operator=(const ExitedWithCode& other) = delete;
bool operator()(int exit_status) const;
+
private:
const int exit_code_;
};
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
// Tests that an exit code describes an exit due to termination by a
// given signal.
-// GOOGLETEST_CM0006 DO NOT DELETE
class GTEST_API_ KilledBySignal {
public:
explicit KilledBySignal(int signum);
bool operator()(int exit_status) const;
+
private:
const int signum_;
};
-# endif // !GTEST_OS_WINDOWS
+#endif // !GTEST_OS_WINDOWS
// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
// The death testing framework causes this to have interesting semantics,
@@ -257,23 +259,21 @@ class GTEST_API_ KilledBySignal {
// EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
// }, "death");
//
-# ifdef NDEBUG
+#ifdef NDEBUG
-# define EXPECT_DEBUG_DEATH(statement, regex) \
+#define EXPECT_DEBUG_DEATH(statement, regex) \
GTEST_EXECUTE_STATEMENT_(statement, regex)
-# define ASSERT_DEBUG_DEATH(statement, regex) \
+#define ASSERT_DEBUG_DEATH(statement, regex) \
GTEST_EXECUTE_STATEMENT_(statement, regex)
-# else
+#else
-# define EXPECT_DEBUG_DEATH(statement, regex) \
- EXPECT_DEATH(statement, regex)
+#define EXPECT_DEBUG_DEATH(statement, regex) EXPECT_DEATH(statement, regex)
-# define ASSERT_DEBUG_DEATH(statement, regex) \
- ASSERT_DEATH(statement, regex)
+#define ASSERT_DEBUG_DEATH(statement, regex) ASSERT_DEATH(statement, regex)
-# endif // NDEBUG for EXPECT_DEBUG_DEATH
+#endif // NDEBUG for EXPECT_DEBUG_DEATH
#endif // GTEST_HAS_DEATH_TEST
// This macro is used for implementing macros such as
@@ -311,18 +311,17 @@ class GTEST_API_ KilledBySignal {
// statement unconditionally returns or throws. The Message constructor at
// the end allows the syntax of streaming additional messages into the
// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH.
-# define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (::testing::internal::AlwaysTrue()) { \
- GTEST_LOG_(WARNING) \
- << "Death tests are not supported on this platform.\n" \
- << "Statement '" #statement "' cannot be verified."; \
- } else if (::testing::internal::AlwaysFalse()) { \
- ::testing::internal::RE::PartialMatch(".*", (regex)); \
- GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
- terminator; \
- } else \
- ::testing::Message()
+#define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ GTEST_LOG_(WARNING) << "Death tests are not supported on this platform.\n" \
+ << "Statement '" #statement "' cannot be verified."; \
+ } else if (::testing::internal::AlwaysFalse()) { \
+ ::testing::internal::RE::PartialMatch(".*", (regex)); \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ terminator; \
+ } else \
+ ::testing::Message()
// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
@@ -330,15 +329,15 @@ class GTEST_API_ KilledBySignal {
// useful when you are combining death test assertions with normal test
// assertions in one test.
#if GTEST_HAS_DEATH_TEST
-# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
- EXPECT_DEATH(statement, regex)
-# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
- ASSERT_DEATH(statement, regex)
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+ EXPECT_DEATH(statement, regex)
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+ ASSERT_DEATH(statement, regex)
#else
-# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
- GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
-# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
- GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return)
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+ GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+ GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return)
#endif
} // namespace testing
diff --git a/third_party/googletest/src/include/gtest/gtest-matchers.h b/third_party/googletest/src/include/gtest/gtest-matchers.h
index 9fa34a05b..bffa00c53 100644
--- a/third_party/googletest/src/include/gtest/gtest-matchers.h
+++ b/third_party/googletest/src/include/gtest/gtest-matchers.h
@@ -32,6 +32,10 @@
// This file implements just enough of the matcher interface to allow
// EXPECT_DEATH and friends to accept a matcher argument.
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
@@ -98,11 +102,11 @@ class MatchResultListener {
private:
::std::ostream* const stream_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(MatchResultListener);
+ MatchResultListener(const MatchResultListener&) = delete;
+ MatchResultListener& operator=(const MatchResultListener&) = delete;
};
-inline MatchResultListener::~MatchResultListener() {
-}
+inline MatchResultListener::~MatchResultListener() {}
// An instance of a subclass of this knows how to describe itself as a
// matcher.
@@ -176,27 +180,39 @@ namespace internal {
struct AnyEq {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a == b; }
+ bool operator()(const A& a, const B& b) const {
+ return a == b;
+ }
};
struct AnyNe {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a != b; }
+ bool operator()(const A& a, const B& b) const {
+ return a != b;
+ }
};
struct AnyLt {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a < b; }
+ bool operator()(const A& a, const B& b) const {
+ return a < b;
+ }
};
struct AnyGt {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a > b; }
+ bool operator()(const A& a, const B& b) const {
+ return a > b;
+ }
};
struct AnyLe {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a <= b; }
+ bool operator()(const A& a, const B& b) const {
+ return a <= b;
+ }
};
struct AnyGe {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a >= b; }
+ bool operator()(const A& a, const B& b) const {
+ return a >= b;
+ }
};
// A match result listener that ignores the explanation.
@@ -205,7 +221,8 @@ class DummyMatchResultListener : public MatchResultListener {
DummyMatchResultListener() : MatchResultListener(nullptr) {}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DummyMatchResultListener);
+ DummyMatchResultListener(const DummyMatchResultListener&) = delete;
+ DummyMatchResultListener& operator=(const DummyMatchResultListener&) = delete;
};
// A match result listener that forwards the explanation to a given
@@ -217,7 +234,9 @@ class StreamMatchResultListener : public MatchResultListener {
: MatchResultListener(os) {}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener);
+ StreamMatchResultListener(const StreamMatchResultListener&) = delete;
+ StreamMatchResultListener& operator=(const StreamMatchResultListener&) =
+ delete;
};
struct SharedPayloadBase {
@@ -284,17 +303,18 @@ class MatcherBase : private MatcherDescriberInterface {
}
protected:
- MatcherBase() : vtable_(nullptr) {}
+ MatcherBase() : vtable_(nullptr), buffer_() {}
// Constructs a matcher from its implementation.
template <typename U>
- explicit MatcherBase(const MatcherInterface<U>* impl) {
+ explicit MatcherBase(const MatcherInterface<U>* impl)
+ : vtable_(nullptr), buffer_() {
Init(impl);
}
template <typename M, typename = typename std::remove_reference<
M>::type::is_gtest_matcher>
- MatcherBase(M&& m) { // NOLINT
+ MatcherBase(M&& m) : vtable_(nullptr), buffer_() { // NOLINT
Init(std::forward<M>(m));
}
@@ -420,8 +440,8 @@ class MatcherBase : private MatcherDescriberInterface {
static const M& Get(const MatcherBase& m) {
// When inlined along with Init, need to be explicit to avoid violating
// strict aliasing rules.
- const M *ptr = static_cast<const M*>(
- static_cast<const void*>(&m.buffer_));
+ const M* ptr =
+ static_cast<const M*>(static_cast<const void*>(&m.buffer_));
return *ptr;
}
static void Init(MatcherBase& m, M impl) {
@@ -741,7 +761,7 @@ template <typename Rhs>
class EqMatcher : public ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq> {
public:
explicit EqMatcher(const Rhs& rhs)
- : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) { }
+ : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) {}
static const char* Desc() { return "is equal to"; }
static const char* NegatedDesc() { return "isn't equal to"; }
};
@@ -749,7 +769,7 @@ template <typename Rhs>
class NeMatcher : public ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe> {
public:
explicit NeMatcher(const Rhs& rhs)
- : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) { }
+ : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) {}
static const char* Desc() { return "isn't equal to"; }
static const char* NegatedDesc() { return "is equal to"; }
};
@@ -757,7 +777,7 @@ template <typename Rhs>
class LtMatcher : public ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt> {
public:
explicit LtMatcher(const Rhs& rhs)
- : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) { }
+ : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) {}
static const char* Desc() { return "is <"; }
static const char* NegatedDesc() { return "isn't <"; }
};
@@ -765,7 +785,7 @@ template <typename Rhs>
class GtMatcher : public ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt> {
public:
explicit GtMatcher(const Rhs& rhs)
- : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) { }
+ : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) {}
static const char* Desc() { return "is >"; }
static const char* NegatedDesc() { return "isn't >"; }
};
@@ -773,7 +793,7 @@ template <typename Rhs>
class LeMatcher : public ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe> {
public:
explicit LeMatcher(const Rhs& rhs)
- : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) { }
+ : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) {}
static const char* Desc() { return "is <="; }
static const char* NegatedDesc() { return "isn't <="; }
};
@@ -781,7 +801,7 @@ template <typename Rhs>
class GeMatcher : public ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe> {
public:
explicit GeMatcher(const Rhs& rhs)
- : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) { }
+ : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) {}
static const char* Desc() { return "is >="; }
static const char* NegatedDesc() { return "isn't >="; }
};
@@ -872,12 +892,16 @@ PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
// Note: if the parameter of Eq() were declared as const T&, Eq("foo")
// wouldn't compile.
template <typename T>
-inline internal::EqMatcher<T> Eq(T x) { return internal::EqMatcher<T>(x); }
+inline internal::EqMatcher<T> Eq(T x) {
+ return internal::EqMatcher<T>(x);
+}
// Constructs a Matcher<T> from a 'value' of type T. The constructed
// matcher matches any value that's equal to 'value'.
template <typename T>
-Matcher<T>::Matcher(T value) { *this = Eq(value); }
+Matcher<T>::Matcher(T value) {
+ *this = Eq(value);
+}
// Creates a monomorphic matcher that matches anything with type Lhs
// and equal to rhs. A user may need to use this instead of Eq(...)
@@ -892,7 +916,9 @@ Matcher<T>::Matcher(T value) { *this = Eq(value); }
// can always write Matcher<T>(Lt(5)) to be explicit about the type,
// for example.
template <typename Lhs, typename Rhs>
-inline Matcher<Lhs> TypedEq(const Rhs& rhs) { return Eq(rhs); }
+inline Matcher<Lhs> TypedEq(const Rhs& rhs) {
+ return Eq(rhs);
+}
// Creates a polymorphic matcher that matches anything >= x.
template <typename Rhs>
diff --git a/third_party/googletest/src/include/gtest/gtest-message.h b/third_party/googletest/src/include/gtest/gtest-message.h
index becfd49fc..6c8bf9000 100644
--- a/third_party/googletest/src/include/gtest/gtest-message.h
+++ b/third_party/googletest/src/include/gtest/gtest-message.h
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file defines the Message class.
@@ -42,7 +41,9 @@
// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user
// program!
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
@@ -110,8 +111,8 @@ class GTEST_API_ Message {
// Streams a non-pointer value to this object.
template <typename T>
- inline Message& operator <<(const T& val) {
- // Some libraries overload << for STL containers. These
+ inline Message& operator<<(const T& val) {
+ // Some libraries overload << for STL containers. These
// overloads are defined in the global namespace instead of ::std.
//
// C++'s symbol lookup rule (i.e. Koenig lookup) says that these
@@ -125,7 +126,7 @@ class GTEST_API_ Message {
// from the global namespace. With this using declaration,
// overloads of << defined in the global namespace and those
// visible via Koenig lookup are both exposed in this function.
- using ::operator <<;
+ using ::operator<<;
*ss_ << val;
return *this;
}
@@ -144,7 +145,7 @@ class GTEST_API_ Message {
// ensure consistent result across compilers, we always treat NULL
// as "(null)".
template <typename T>
- inline Message& operator <<(T* const& pointer) { // NOLINT
+ inline Message& operator<<(T* const& pointer) { // NOLINT
if (pointer == nullptr) {
*ss_ << "(null)";
} else {
@@ -159,25 +160,23 @@ class GTEST_API_ Message {
// templatized version above. Without this definition, streaming
// endl or other basic IO manipulators to Message will confuse the
// compiler.
- Message& operator <<(BasicNarrowIoManip val) {
+ Message& operator<<(BasicNarrowIoManip val) {
*ss_ << val;
return *this;
}
// Instead of 1/0, we want to see true/false for bool values.
- Message& operator <<(bool b) {
- return *this << (b ? "true" : "false");
- }
+ Message& operator<<(bool b) { return *this << (b ? "true" : "false"); }
// These two overloads allow streaming a wide C string to a Message
// using the UTF-8 encoding.
- Message& operator <<(const wchar_t* wide_c_str);
- Message& operator <<(wchar_t* wide_c_str);
+ Message& operator<<(const wchar_t* wide_c_str);
+ Message& operator<<(wchar_t* wide_c_str);
#if GTEST_HAS_STD_WSTRING
// Converts the given wide string to a narrow string using the UTF-8
// encoding, and streams the result to this Message object.
- Message& operator <<(const ::std::wstring& wstr);
+ Message& operator<<(const ::std::wstring& wstr);
#endif // GTEST_HAS_STD_WSTRING
// Gets the text streamed to this object so far as an std::string.
@@ -196,7 +195,7 @@ class GTEST_API_ Message {
};
// Streams a Message to an ostream.
-inline std::ostream& operator <<(std::ostream& os, const Message& sb) {
+inline std::ostream& operator<<(std::ostream& os, const Message& sb) {
return os << sb.GetString();
}
diff --git a/third_party/googletest/src/include/gtest/gtest-param-test.h b/third_party/googletest/src/include/gtest/gtest-param-test.h
index 804e70281..b55119ac6 100644
--- a/third_party/googletest/src/include/gtest/gtest-param-test.h
+++ b/third_party/googletest/src/include/gtest/gtest-param-test.h
@@ -26,11 +26,14 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// Macros and functions for implementing parameterized tests
// in Google C++ Testing and Mocking Framework (Google Test)
-//
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
@@ -353,9 +356,7 @@ internal::ValueArray<T...> Values(T... v) {
// }
// INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool());
//
-inline internal::ParamGenerator<bool> Bool() {
- return Values(false, true);
-}
+inline internal::ParamGenerator<bool> Bool() { return Values(false, true); }
// Combine() allows the user to combine two or more sequences to produce
// values of a Cartesian product of those sequences' elements.
@@ -428,8 +429,11 @@ internal::CartesianProductHolder<Generator...> Combine(const Generator&... g) {
return 0; \
} \
static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \
- GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \
- test_name)); \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \
+ const GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name) &) = delete; /* NOLINT */ \
}; \
int GTEST_TEST_CLASS_NAME_(test_suite_name, \
test_name)::gtest_registering_dummy_ = \
@@ -453,43 +457,42 @@ internal::CartesianProductHolder<Generator...> Combine(const Generator&... g) {
#define GTEST_GET_FIRST_(first, ...) first
#define GTEST_GET_SECOND_(first, second, ...) second
-#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \
- static ::testing::internal::ParamGenerator<test_suite_name::ParamType> \
- gtest_##prefix##test_suite_name##_EvalGenerator_() { \
- return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \
- } \
- static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \
- const ::testing::TestParamInfo<test_suite_name::ParamType>& info) { \
- if (::testing::internal::AlwaysFalse()) { \
- ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \
- __VA_ARGS__, \
- ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
- DUMMY_PARAM_))); \
- auto t = std::make_tuple(__VA_ARGS__); \
- static_assert(std::tuple_size<decltype(t)>::value <= 2, \
- "Too Many Args!"); \
- } \
- return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \
- __VA_ARGS__, \
- ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
- DUMMY_PARAM_))))(info); \
- } \
- static int gtest_##prefix##test_suite_name##_dummy_ \
- GTEST_ATTRIBUTE_UNUSED_ = \
- ::testing::UnitTest::GetInstance() \
- ->parameterized_test_registry() \
- .GetTestSuitePatternHolder<test_suite_name>( \
- GTEST_STRINGIFY_(test_suite_name), \
- ::testing::internal::CodeLocation(__FILE__, __LINE__)) \
- ->AddTestSuiteInstantiation( \
- GTEST_STRINGIFY_(prefix), \
- &gtest_##prefix##test_suite_name##_EvalGenerator_, \
- &gtest_##prefix##test_suite_name##_EvalGenerateName_, \
+#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \
+ static ::testing::internal::ParamGenerator<test_suite_name::ParamType> \
+ gtest_##prefix##test_suite_name##_EvalGenerator_() { \
+ return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \
+ } \
+ static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \
+ const ::testing::TestParamInfo<test_suite_name::ParamType>& info) { \
+ if (::testing::internal::AlwaysFalse()) { \
+ ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \
+ __VA_ARGS__, \
+ ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
+ DUMMY_PARAM_))); \
+ auto t = std::make_tuple(__VA_ARGS__); \
+ static_assert(std::tuple_size<decltype(t)>::value <= 2, \
+ "Too Many Args!"); \
+ } \
+ return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \
+ __VA_ARGS__, \
+ ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
+ DUMMY_PARAM_))))(info); \
+ } \
+ static int gtest_##prefix##test_suite_name##_dummy_ \
+ GTEST_ATTRIBUTE_UNUSED_ = \
+ ::testing::UnitTest::GetInstance() \
+ ->parameterized_test_registry() \
+ .GetTestSuitePatternHolder<test_suite_name>( \
+ GTEST_STRINGIFY_(test_suite_name), \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__)) \
+ ->AddTestSuiteInstantiation( \
+ GTEST_STRINGIFY_(prefix), \
+ &gtest_##prefix##test_suite_name##_EvalGenerator_, \
+ &gtest_##prefix##test_suite_name##_EvalGenerateName_, \
__FILE__, __LINE__)
-
// Allow Marking a Parameterized test class as not needing to be instantiated.
-#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \
+#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \
namespace gtest_do_not_use_outside_namespace_scope {} \
static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \
GTEST_STRINGIFY_(T))
diff --git a/third_party/googletest/src/include/gtest/gtest-printers.h b/third_party/googletest/src/include/gtest/gtest-printers.h
index 076c9de1f..a91e8b8b1 100644
--- a/third_party/googletest/src/include/gtest/gtest-printers.h
+++ b/third_party/googletest/src/include/gtest/gtest-printers.h
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Test - The Google C++ Testing and Mocking Framework
//
// This file implements a universal value printer that can print a
@@ -95,7 +94,9 @@
// being defined as many user-defined container types don't have
// value_type.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
@@ -257,12 +258,10 @@ struct ConvertibleToStringViewPrinter {
#endif
};
-
// Prints the given number of bytes in the given object to the given
// ostream.
GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
- size_t count,
- ::std::ostream* os);
+ size_t count, ::std::ostream* os);
struct RawBytesPrinter {
// SFINAE on `sizeof` to make sure we have a complete type.
template <typename T, size_t = sizeof(T)>
@@ -360,7 +359,7 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t);
GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t);
#endif
@@ -375,12 +374,12 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t);
// to point to a NUL-terminated string, and thus can print it as a string.
#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
- template <> \
- class FormatForComparison<CharType*, OtherStringType> { \
- public: \
- static ::std::string Format(CharType* value) { \
- return ::testing::PrintToString(value); \
- } \
+ template <> \
+ class FormatForComparison<CharType*, OtherStringType> { \
+ public: \
+ static ::std::string Format(CharType* value) { \
+ return ::testing::PrintToString(value); \
+ } \
}
GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
@@ -410,8 +409,8 @@ GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
template <typename T1, typename T2>
-std::string FormatForComparisonFailureMessage(
- const T1& value, const T2& /* other_operand */) {
+std::string FormatForComparisonFailureMessage(const T1& value,
+ const T2& /* other_operand */) {
return FormatForComparison<T1, T2>::Format(value);
}
@@ -479,6 +478,12 @@ inline void PrintTo(char8_t c, ::std::ostream* os) {
}
#endif
+// gcc/clang __{u,}int128_t
+#if defined(__SIZEOF_INT128__)
+GTEST_API_ void PrintTo(__uint128_t v, ::std::ostream* os);
+GTEST_API_ void PrintTo(__int128_t v, ::std::ostream* os);
+#endif // __SIZEOF_INT128__
+
// Overloads for C strings.
GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
inline void PrintTo(char* s, ::std::ostream* os) {
@@ -545,7 +550,7 @@ void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
}
// Overloads for ::std::string.
-GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os);
+GTEST_API_ void PrintStringTo(const ::std::string& s, ::std::ostream* os);
inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
PrintStringTo(s, os);
}
@@ -572,7 +577,7 @@ inline void PrintTo(const ::std::u32string& s, ::std::ostream* os) {
// Overloads for ::std::wstring.
#if GTEST_HAS_STD_WSTRING
-GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os);
+GTEST_API_ void PrintWideStringTo(const ::std::wstring& s, ::std::ostream* os);
inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
PrintWideStringTo(s, os);
}
@@ -587,6 +592,12 @@ inline void PrintTo(internal::StringView sp, ::std::ostream* os) {
inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; }
+#if GTEST_HAS_RTTI
+inline void PrintTo(const std::type_info& info, std::ostream* os) {
+ *os << internal::GetTypeName(info);
+}
+#endif // GTEST_HAS_RTTI
+
template <typename T>
void PrintTo(std::reference_wrapper<T> ref, ::std::ostream* os) {
UniversalPrinter<T&>::Print(ref.get(), os);
@@ -744,6 +755,14 @@ class UniversalPrinter<Optional<T>> {
}
};
+template <>
+class UniversalPrinter<decltype(Nullopt())> {
+ public:
+ static void Print(decltype(Nullopt()), ::std::ostream* os) {
+ *os << "(nullopt)";
+ }
+};
+
#endif // GTEST_INTERNAL_HAS_OPTIONAL
#if GTEST_INTERNAL_HAS_VARIANT
@@ -802,8 +821,8 @@ void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
}
}
// This overload prints a (const) char array compactly.
-GTEST_API_ void UniversalPrintArray(
- const char* begin, size_t len, ::std::ostream* os);
+GTEST_API_ void UniversalPrintArray(const char* begin, size_t len,
+ ::std::ostream* os);
#ifdef __cpp_char8_t
// This overload prints a (const) char8_t array compactly.
@@ -820,8 +839,8 @@ GTEST_API_ void UniversalPrintArray(const char32_t* begin, size_t len,
::std::ostream* os);
// This overload prints a (const) wchar_t array compactly.
-GTEST_API_ void UniversalPrintArray(
- const wchar_t* begin, size_t len, ::std::ostream* os);
+GTEST_API_ void UniversalPrintArray(const wchar_t* begin, size_t len,
+ ::std::ostream* os);
// Implements printing an array type T[N].
template <typename T, size_t N>
@@ -980,10 +999,10 @@ void UniversalPrint(const T& value, ::std::ostream* os) {
UniversalPrinter<T1>::Print(value, os);
}
-typedef ::std::vector< ::std::string> Strings;
+typedef ::std::vector<::std::string> Strings;
- // Tersely prints the first N fields of a tuple to a string vector,
- // one element for each field.
+// Tersely prints the first N fields of a tuple to a string vector,
+// one element for each field.
template <typename Tuple>
void TersePrintPrefixToStrings(const Tuple&, std::integral_constant<size_t, 0>,
Strings*) {}
diff --git a/third_party/googletest/src/include/gtest/gtest-spi.h b/third_party/googletest/src/include/gtest/gtest-spi.h
index eacef4466..bec8c4810 100644
--- a/third_party/googletest/src/include/gtest/gtest-spi.h
+++ b/third_party/googletest/src/include/gtest/gtest-spi.h
@@ -27,12 +27,9 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
// Utilities for testing Google Test itself and code that uses Google Test
// (e.g. frameworks built on top of Google Test).
-// GOOGLETEST_CM0004 DO NOT DELETE
-
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
@@ -88,7 +85,10 @@ class GTEST_API_ ScopedFakeTestPartResultReporter
TestPartResultReporterInterface* old_reporter_;
TestPartResultArray* const result_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter);
+ ScopedFakeTestPartResultReporter(const ScopedFakeTestPartResultReporter&) =
+ delete;
+ ScopedFakeTestPartResultReporter& operator=(
+ const ScopedFakeTestPartResultReporter&) = delete;
};
namespace internal {
@@ -104,12 +104,14 @@ class GTEST_API_ SingleFailureChecker {
SingleFailureChecker(const TestPartResultArray* results,
TestPartResult::Type type, const std::string& substr);
~SingleFailureChecker();
+
private:
const TestPartResultArray* const results_;
const TestPartResult::Type type_;
const std::string substr_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker);
+ SingleFailureChecker(const SingleFailureChecker&) = delete;
+ SingleFailureChecker& operator=(const SingleFailureChecker&) = delete;
};
} // namespace internal
@@ -119,7 +121,8 @@ class GTEST_API_ SingleFailureChecker {
GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
// A set of macros for testing Google Test assertions or code that's expected
-// to generate Google Test fatal failures. It verifies that the given
+// to generate Google Test fatal failures (e.g. a failure from an ASSERT_EQ, but
+// not a non-fatal failure, as from EXPECT_EQ). It verifies that the given
// statement will cause exactly one fatal Google Test failure with 'substr'
// being part of the failure message.
//
@@ -141,44 +144,46 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
// helper macro, due to some peculiarity in how the preprocessor
// works. The AcceptsMacroThatExpandsToUnprotectedComma test in
// gtest_unittest.cc will fail to compile if we do that.
-#define EXPECT_FATAL_FAILURE(statement, substr) \
- do { \
- class GTestExpectFatalFailureHelper {\
- public:\
- static void Execute() { statement; }\
- };\
- ::testing::TestPartResultArray gtest_failures;\
- ::testing::internal::SingleFailureChecker gtest_checker(\
- &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
- {\
- ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
- ::testing::ScopedFakeTestPartResultReporter:: \
- INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
- GTestExpectFatalFailureHelper::Execute();\
- }\
+#define EXPECT_FATAL_FAILURE(statement, substr) \
+ do { \
+ class GTestExpectFatalFailureHelper { \
+ public: \
+ static void Execute() { statement; } \
+ }; \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
+ &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+ ::testing::ScopedFakeTestPartResultReporter:: \
+ INTERCEPT_ONLY_CURRENT_THREAD, \
+ &gtest_failures); \
+ GTestExpectFatalFailureHelper::Execute(); \
+ } \
} while (::testing::internal::AlwaysFalse())
-#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
- do { \
- class GTestExpectFatalFailureHelper {\
- public:\
- static void Execute() { statement; }\
- };\
- ::testing::TestPartResultArray gtest_failures;\
- ::testing::internal::SingleFailureChecker gtest_checker(\
- &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
- {\
- ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
- ::testing::ScopedFakeTestPartResultReporter:: \
- INTERCEPT_ALL_THREADS, &gtest_failures);\
- GTestExpectFatalFailureHelper::Execute();\
- }\
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+ do { \
+ class GTestExpectFatalFailureHelper { \
+ public: \
+ static void Execute() { statement; } \
+ }; \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
+ &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+ ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+ &gtest_failures); \
+ GTestExpectFatalFailureHelper::Execute(); \
+ } \
} while (::testing::internal::AlwaysFalse())
// A macro for testing Google Test assertions or code that's expected to
-// generate Google Test non-fatal failures. It asserts that the given
-// statement will cause exactly one non-fatal Google Test failure with 'substr'
-// being part of the failure message.
+// generate Google Test non-fatal failures (e.g. a failure from an EXPECT_EQ,
+// but not from an ASSERT_EQ). It asserts that the given statement will cause
+// exactly one non-fatal Google Test failure with 'substr' being part of the
+// failure message.
//
// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
// affects and considers failures generated in the current thread and
@@ -207,32 +212,37 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
// instead of
// GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
// to avoid an MSVC warning on unreachable code.
-#define EXPECT_NONFATAL_FAILURE(statement, substr) \
- do {\
- ::testing::TestPartResultArray gtest_failures;\
- ::testing::internal::SingleFailureChecker gtest_checker(\
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \
+ do { \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
&gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
- (substr));\
- {\
- ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
- ::testing::ScopedFakeTestPartResultReporter:: \
- INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
- if (::testing::internal::AlwaysTrue()) { statement; }\
- }\
+ (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+ ::testing::ScopedFakeTestPartResultReporter:: \
+ INTERCEPT_ONLY_CURRENT_THREAD, \
+ &gtest_failures); \
+ if (::testing::internal::AlwaysTrue()) { \
+ statement; \
+ } \
+ } \
} while (::testing::internal::AlwaysFalse())
-#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
- do {\
- ::testing::TestPartResultArray gtest_failures;\
- ::testing::internal::SingleFailureChecker gtest_checker(\
- &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
- (substr));\
- {\
- ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+ do { \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
+ &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+ (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
- &gtest_failures);\
- if (::testing::internal::AlwaysTrue()) { statement; }\
- }\
+ &gtest_failures); \
+ if (::testing::internal::AlwaysTrue()) { \
+ statement; \
+ } \
+ } \
} while (::testing::internal::AlwaysFalse())
#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
diff --git a/third_party/googletest/src/include/gtest/gtest-test-part.h b/third_party/googletest/src/include/gtest/gtest-test-part.h
index 203fdf98c..09cc8c34f 100644
--- a/third_party/googletest/src/include/gtest/gtest-test-part.h
+++ b/third_party/googletest/src/include/gtest/gtest-test-part.h
@@ -26,14 +26,17 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
#include <iosfwd>
#include <vector>
+
#include "gtest/internal/gtest-internal.h"
#include "gtest/internal/gtest-string.h"
@@ -142,7 +145,8 @@ class GTEST_API_ TestPartResultArray {
private:
std::vector<TestPartResult> array_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray);
+ TestPartResultArray(const TestPartResultArray&) = delete;
+ TestPartResultArray& operator=(const TestPartResultArray&) = delete;
};
// This interface knows how to report a test part result.
@@ -168,11 +172,13 @@ class GTEST_API_ HasNewFatalFailureHelper
~HasNewFatalFailureHelper() override;
void ReportTestPartResult(const TestPartResult& result) override;
bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
+
private:
bool has_new_fatal_failure_;
TestPartResultReporterInterface* original_reporter_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper);
+ HasNewFatalFailureHelper(const HasNewFatalFailureHelper&) = delete;
+ HasNewFatalFailureHelper& operator=(const HasNewFatalFailureHelper&) = delete;
};
} // namespace internal
diff --git a/third_party/googletest/src/include/gtest/gtest-typed-test.h b/third_party/googletest/src/include/gtest/gtest-typed-test.h
index 9fdc6be10..bd35a3266 100644
--- a/third_party/googletest/src/include/gtest/gtest-typed-test.h
+++ b/third_party/googletest/src/include/gtest/gtest-typed-test.h
@@ -27,7 +27,9 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
@@ -190,7 +192,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
typedef ::testing::internal::GenerateTypeList<Types>::type \
GTEST_TYPE_PARAMS_(CaseName); \
typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \
- GTEST_NAME_GENERATOR_(CaseName)
+ GTEST_NAME_GENERATOR_(CaseName)
#define TYPED_TEST(CaseName, TestName) \
static_assert(sizeof(GTEST_STRINGIFY_(TestName)) > 1, \
@@ -256,7 +258,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
// #included in multiple translation units linked together.
#define TYPED_TEST_SUITE_P(SuiteName) \
static ::testing::internal::TypedTestSuitePState \
- GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName)
+ GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName)
// Legacy API is deprecated but still available
#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
@@ -301,21 +303,21 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
REGISTER_TYPED_TEST_SUITE_P
#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) \
- static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1, \
- "test-suit-prefix must not be empty"); \
- static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \
- ::testing::internal::TypeParameterizedTestSuite< \
- SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \
- ::testing::internal::GenerateTypeList<Types>::type>:: \
- Register(GTEST_STRINGIFY_(Prefix), \
- ::testing::internal::CodeLocation(__FILE__, __LINE__), \
- &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName), \
- GTEST_STRINGIFY_(SuiteName), \
- GTEST_REGISTERED_TEST_NAMES_(SuiteName), \
- ::testing::internal::GenerateNames< \
- ::testing::internal::NameGeneratorSelector< \
- __VA_ARGS__>::type, \
+#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) \
+ static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1, \
+ "test-suit-prefix must not be empty"); \
+ static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \
+ ::testing::internal::TypeParameterizedTestSuite< \
+ SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \
+ ::testing::internal::GenerateTypeList<Types>::type>:: \
+ Register(GTEST_STRINGIFY_(Prefix), \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__), \
+ &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName), \
+ GTEST_STRINGIFY_(SuiteName), \
+ GTEST_REGISTERED_TEST_NAMES_(SuiteName), \
+ ::testing::internal::GenerateNames< \
+ ::testing::internal::NameGeneratorSelector< \
+ __VA_ARGS__>::type, \
::testing::internal::GenerateTypeList<Types>::type>())
// Legacy API is deprecated but still available
diff --git a/third_party/googletest/src/include/gtest/gtest.h b/third_party/googletest/src/include/gtest/gtest.h
index 7a5d057c4..d19a587a1 100644
--- a/third_party/googletest/src/include/gtest/gtest.h
+++ b/third_party/googletest/src/include/gtest/gtest.h
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file defines the public API for Google Test. It should be
@@ -47,8 +46,6 @@
// registration from Barthelemy Dagenais' (barthelemy@prologique.com)
// easyUnit framework.
-// GOOGLETEST_CM0001 DO NOT DELETE
-
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_H_
@@ -59,31 +56,22 @@
#include <type_traits>
#include <vector>
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-string.h"
+#include "gtest/gtest-assertion-result.h"
#include "gtest/gtest-death-test.h"
#include "gtest/gtest-matchers.h"
#include "gtest/gtest-message.h"
#include "gtest/gtest-param-test.h"
#include "gtest/gtest-printers.h"
-#include "gtest/gtest_prod.h"
#include "gtest/gtest-test-part.h"
#include "gtest/gtest-typed-test.h"
+#include "gtest/gtest_pred_impl.h"
+#include "gtest/gtest_prod.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
/* class A needs to have dll-interface to be used by clients of class B */)
-namespace testing {
-
-// Silence C4100 (unreferenced formal parameter) and 4805
-// unsafe mix of type 'const int' and type 'const bool'
-#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable:4805)
-# pragma warning(disable:4100)
-#endif
-
-
// Declares the flags.
// This flag temporary enables the disabled tests.
@@ -138,6 +126,12 @@ GTEST_DECLARE_int32_(random_seed);
// is 1. If the value is -1 the tests are repeating forever.
GTEST_DECLARE_int32_(repeat);
+// This flag controls whether Google Test Environments are recreated for each
+// repeat of the tests. The default value is true. If set to false the global
+// test Environment objects are only set up once, for the first iteration, and
+// only torn down once, for the last.
+GTEST_DECLARE_bool_(recreate_environments_when_repeating);
+
// This flag controls whether Google Test includes Google Test internal
// stack frames in failure stack traces.
GTEST_DECLARE_bool_(show_internal_stack_frames);
@@ -163,6 +157,16 @@ GTEST_DECLARE_string_(stream_result_to);
GTEST_DECLARE_string_(flagfile);
#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
+namespace testing {
+
+// Silence C4100 (unreferenced formal parameter) and 4805
+// unsafe mix of type 'const int' and type 'const bool'
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4805)
+#pragma warning(disable : 4100)
+#endif
+
// The upper limit for valid stack trace depths.
const int kMaxStackTraceDepth = 100;
@@ -201,193 +205,6 @@ using TestCase = TestSuite;
class TestInfo;
class UnitTest;
-// A class for indicating whether an assertion was successful. When
-// the assertion wasn't successful, the AssertionResult object
-// remembers a non-empty message that describes how it failed.
-//
-// To create an instance of this class, use one of the factory functions
-// (AssertionSuccess() and AssertionFailure()).
-//
-// This class is useful for two purposes:
-// 1. Defining predicate functions to be used with Boolean test assertions
-// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
-// 2. Defining predicate-format functions to be
-// used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
-//
-// For example, if you define IsEven predicate:
-//
-// testing::AssertionResult IsEven(int n) {
-// if ((n % 2) == 0)
-// return testing::AssertionSuccess();
-// else
-// return testing::AssertionFailure() << n << " is odd";
-// }
-//
-// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
-// will print the message
-//
-// Value of: IsEven(Fib(5))
-// Actual: false (5 is odd)
-// Expected: true
-//
-// instead of a more opaque
-//
-// Value of: IsEven(Fib(5))
-// Actual: false
-// Expected: true
-//
-// in case IsEven is a simple Boolean predicate.
-//
-// If you expect your predicate to be reused and want to support informative
-// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
-// about half as often as positive ones in our tests), supply messages for
-// both success and failure cases:
-//
-// testing::AssertionResult IsEven(int n) {
-// if ((n % 2) == 0)
-// return testing::AssertionSuccess() << n << " is even";
-// else
-// return testing::AssertionFailure() << n << " is odd";
-// }
-//
-// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
-//
-// Value of: IsEven(Fib(6))
-// Actual: true (8 is even)
-// Expected: false
-//
-// NB: Predicates that support negative Boolean assertions have reduced
-// performance in positive ones so be careful not to use them in tests
-// that have lots (tens of thousands) of positive Boolean assertions.
-//
-// To use this class with EXPECT_PRED_FORMAT assertions such as:
-//
-// // Verifies that Foo() returns an even number.
-// EXPECT_PRED_FORMAT1(IsEven, Foo());
-//
-// you need to define:
-//
-// testing::AssertionResult IsEven(const char* expr, int n) {
-// if ((n % 2) == 0)
-// return testing::AssertionSuccess();
-// else
-// return testing::AssertionFailure()
-// << "Expected: " << expr << " is even\n Actual: it's " << n;
-// }
-//
-// If Foo() returns 5, you will see the following message:
-//
-// Expected: Foo() is even
-// Actual: it's 5
-//
-class GTEST_API_ AssertionResult {
- public:
- // Copy constructor.
- // Used in EXPECT_TRUE/FALSE(assertion_result).
- AssertionResult(const AssertionResult& other);
-
-// C4800 is a level 3 warning in Visual Studio 2015 and earlier.
-// This warning is not emitted in Visual Studio 2017.
-// This warning is off by default starting in Visual Studio 2019 but can be
-// enabled with command-line options.
-#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
- GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */)
-#endif
-
- // Used in the EXPECT_TRUE/FALSE(bool_expression).
- //
- // T must be contextually convertible to bool.
- //
- // The second parameter prevents this overload from being considered if
- // the argument is implicitly convertible to AssertionResult. In that case
- // we want AssertionResult's copy constructor to be used.
- template <typename T>
- explicit AssertionResult(
- const T& success,
- typename std::enable_if<
- !std::is_convertible<T, AssertionResult>::value>::type*
- /*enabler*/
- = nullptr)
- : success_(success) {}
-
-#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
- GTEST_DISABLE_MSC_WARNINGS_POP_()
-#endif
-
- // Assignment operator.
- AssertionResult& operator=(AssertionResult other) {
- swap(other);
- return *this;
- }
-
- // Returns true if and only if the assertion succeeded.
- operator bool() const { return success_; } // NOLINT
-
- // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
- AssertionResult operator!() const;
-
- // Returns the text streamed into this AssertionResult. Test assertions
- // use it when they fail (i.e., the predicate's outcome doesn't match the
- // assertion's expectation). When nothing has been streamed into the
- // object, returns an empty string.
- const char* message() const {
- return message_.get() != nullptr ? message_->c_str() : "";
- }
- // Deprecated; please use message() instead.
- const char* failure_message() const { return message(); }
-
- // Streams a custom failure message into this object.
- template <typename T> AssertionResult& operator<<(const T& value) {
- AppendMessage(Message() << value);
- return *this;
- }
-
- // Allows streaming basic output manipulators such as endl or flush into
- // this object.
- AssertionResult& operator<<(
- ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
- AppendMessage(Message() << basic_manipulator);
- return *this;
- }
-
- private:
- // Appends the contents of message to message_.
- void AppendMessage(const Message& a_message) {
- if (message_.get() == nullptr) message_.reset(new ::std::string);
- message_->append(a_message.GetString().c_str());
- }
-
- // Swap the contents of this AssertionResult with other.
- void swap(AssertionResult& other);
-
- // Stores result of the assertion predicate.
- bool success_;
- // Stores the message describing the condition in case the expectation
- // construct is not satisfied with the predicate's outcome.
- // Referenced via a pointer to avoid taking too much stack frame space
- // with test assertions.
- std::unique_ptr< ::std::string> message_;
-};
-
-// Makes a successful assertion result.
-GTEST_API_ AssertionResult AssertionSuccess();
-
-// Makes a failed assertion result.
-GTEST_API_ AssertionResult AssertionFailure();
-
-// Makes a failed assertion result with the given failure message.
-// Deprecated; use AssertionFailure() << msg.
-GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
-
-} // namespace testing
-
-// Includes the auto-generated header that implements a family of generic
-// predicate assertion macros. This include comes late because it relies on
-// APIs declared above.
-#include "gtest/gtest_pred_impl.h"
-
-namespace testing {
-
// The abstract class that all tests inherit from.
//
// In Google Test, a unit test program contains one or many TestSuites, and
@@ -522,7 +339,8 @@ class GTEST_API_ Test {
virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; }
// We disallow copying Tests.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Test);
+ Test(const Test&) = delete;
+ Test& operator=(const Test&) = delete;
};
typedef internal::TimeInMillis TimeInMillis;
@@ -536,24 +354,17 @@ class TestProperty {
// C'tor. TestProperty does NOT have a default constructor.
// Always use this constructor (with parameters) to create a
// TestProperty object.
- TestProperty(const std::string& a_key, const std::string& a_value) :
- key_(a_key), value_(a_value) {
- }
+ TestProperty(const std::string& a_key, const std::string& a_value)
+ : key_(a_key), value_(a_value) {}
// Gets the user supplied key.
- const char* key() const {
- return key_.c_str();
- }
+ const char* key() const { return key_.c_str(); }
// Gets the user supplied value.
- const char* value() const {
- return value_.c_str();
- }
+ const char* value() const { return value_.c_str(); }
// Sets a new value, overriding the one supplied in the constructor.
- void SetValue(const std::string& new_value) {
- value_ = new_value;
- }
+ void SetValue(const std::string& new_value) { value_ = new_value; }
private:
// The key supplied by the user.
@@ -687,7 +498,8 @@ class GTEST_API_ TestResult {
TimeInMillis elapsed_time_;
// We disallow copying TestResult.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult);
+ TestResult(const TestResult&) = delete;
+ TestResult& operator=(const TestResult&) = delete;
}; // class TestResult
// A TestInfo object stores the following information about a test:
@@ -811,8 +623,8 @@ class GTEST_API_ TestInfo {
}
// These fields are immutable properties of the test.
- const std::string test_suite_name_; // test suite name
- const std::string name_; // Test name
+ const std::string test_suite_name_; // test suite name
+ const std::string name_; // Test name
// Name of the parameter type, or NULL if this is not a typed or a
// type-parameterized test.
const std::unique_ptr<const ::std::string> type_param_;
@@ -833,7 +645,8 @@ class GTEST_API_ TestInfo {
// test for the second time.
TestResult result_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
+ TestInfo(const TestInfo&) = delete;
+ TestInfo& operator=(const TestInfo&) = delete;
};
// A test suite, which consists of a vector of TestInfos.
@@ -941,7 +754,7 @@ class GTEST_API_ TestSuite {
// Adds a TestInfo to this test suite. Will delete the TestInfo upon
// destruction of the TestSuite object.
- void AddTestInfo(TestInfo * test_info);
+ void AddTestInfo(TestInfo* test_info);
// Clears the results of all tests in this test suite.
void ClearResult();
@@ -1042,7 +855,8 @@ class GTEST_API_ TestSuite {
TestResult ad_hoc_test_result_;
// We disallow copying TestSuites.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestSuite);
+ TestSuite(const TestSuite&) = delete;
+ TestSuite& operator=(const TestSuite&) = delete;
};
// An Environment object is capable of setting up and tearing down an
@@ -1069,6 +883,7 @@ class Environment {
// Override this to define how to tear down the environment.
virtual void TearDown() {}
+
private:
// If you see an error about overriding the following function or
// about it being private, you have mis-spelled SetUp() as Setup().
@@ -1120,6 +935,9 @@ class TestEventListener {
// Fired before the test starts.
virtual void OnTestStart(const TestInfo& test_info) = 0;
+ // Fired when a test is disabled
+ virtual void OnTestDisabled(const TestInfo& /*test_info*/) {}
+
// Fired after a failed assertion or a SUCCEED() invocation.
// If you want to throw an exception from this function to skip to the next
// TEST, it must be AssertionException defined above, or inherited from it.
@@ -1143,8 +961,7 @@ class TestEventListener {
virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0;
// Fired after each iteration of tests finishes.
- virtual void OnTestIterationEnd(const UnitTest& unit_test,
- int iteration) = 0;
+ virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration) = 0;
// Fired after all test activities have ended.
virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0;
@@ -1169,6 +986,7 @@ class EmptyTestEventListener : public TestEventListener {
#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
void OnTestStart(const TestInfo& /*test_info*/) override {}
+ void OnTestDisabled(const TestInfo& /*test_info*/) override {}
void OnTestPartResult(const TestPartResult& /*test_part_result*/) override {}
void OnTestEnd(const TestInfo& /*test_info*/) override {}
void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {}
@@ -1258,7 +1076,8 @@ class GTEST_API_ TestEventListeners {
TestEventListener* default_xml_generator_;
// We disallow copying TestEventListeners.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
+ TestEventListeners(const TestEventListeners&) = delete;
+ TestEventListeners& operator=(const TestEventListeners&) = delete;
};
// A UnitTest consists of a vector of TestSuites.
@@ -1301,8 +1120,7 @@ class GTEST_API_ UnitTest {
// Returns the TestInfo object for the test that's currently running,
// or NULL if no test is running.
- const TestInfo* current_test_info() const
- GTEST_LOCK_EXCLUDED_(mutex_);
+ const TestInfo* current_test_info() const GTEST_LOCK_EXCLUDED_(mutex_);
// Returns the random seed used at the start of the current test run.
int random_seed() const;
@@ -1408,8 +1226,7 @@ class GTEST_API_ UnitTest {
// eventually call this to report their results. The user code
// should use the assertion macros instead of calling this directly.
void AddTestPartResult(TestPartResult::Type result_type,
- const char* file_name,
- int line_number,
+ const char* file_name, int line_number,
const std::string& message,
const std::string& os_stack_trace)
GTEST_LOCK_EXCLUDED_(mutex_);
@@ -1440,8 +1257,7 @@ class GTEST_API_ UnitTest {
friend std::set<std::string>* internal::GetIgnoredParameterizedTestSuites();
friend internal::UnitTestImpl* internal::GetUnitTestImpl();
friend void internal::ReportFailureInUnknownLocation(
- TestPartResult::Type result_type,
- const std::string& message);
+ TestPartResult::Type result_type, const std::string& message);
// Creates an empty UnitTest.
UnitTest();
@@ -1455,8 +1271,7 @@ class GTEST_API_ UnitTest {
GTEST_LOCK_EXCLUDED_(mutex_);
// Pops a trace from the per-thread Google Test trace stack.
- void PopGTestTrace()
- GTEST_LOCK_EXCLUDED_(mutex_);
+ void PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_);
// Protects mutable state in *impl_. This is mutable as some const
// methods need to lock it too.
@@ -1469,7 +1284,8 @@ class GTEST_API_ UnitTest {
internal::UnitTestImpl* impl_;
// We disallow copying UnitTest.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
+ UnitTest(const UnitTest&) = delete;
+ UnitTest& operator=(const UnitTest&) = delete;
};
// A convenient wrapper for adding an environment for the test
@@ -1520,13 +1336,11 @@ namespace internal {
// when calling EXPECT_* in a tight loop.
template <typename T1, typename T2>
AssertionResult CmpHelperEQFailure(const char* lhs_expression,
- const char* rhs_expression,
- const T1& lhs, const T2& rhs) {
- return EqFailure(lhs_expression,
- rhs_expression,
+ const char* rhs_expression, const T1& lhs,
+ const T2& rhs) {
+ return EqFailure(lhs_expression, rhs_expression,
FormatForComparisonFailureMessage(lhs, rhs),
- FormatForComparisonFailureMessage(rhs, lhs),
- false);
+ FormatForComparisonFailureMessage(rhs, lhs), false);
}
// This block of code defines operator==/!=
@@ -1539,8 +1353,7 @@ inline bool operator!=(faketype, faketype) { return false; }
// The helper function for {ASSERT|EXPECT}_EQ.
template <typename T1, typename T2>
AssertionResult CmpHelperEQ(const char* lhs_expression,
- const char* rhs_expression,
- const T1& lhs,
+ const char* rhs_expression, const T1& lhs,
const T2& rhs) {
if (lhs == rhs) {
return AssertionSuccess();
@@ -1571,8 +1384,7 @@ class EqHelper {
// Even though its body looks the same as the above version, we
// cannot merge the two, as it will make anonymous enums unhappy.
static AssertionResult Compare(const char* lhs_expression,
- const char* rhs_expression,
- BiggestInt lhs,
+ const char* rhs_expression, BiggestInt lhs,
BiggestInt rhs) {
return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
}
@@ -1607,16 +1419,16 @@ AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2,
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
-template <typename T1, typename T2>\
-AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
- const T1& val1, const T2& val2) {\
- if (val1 op val2) {\
- return AssertionSuccess();\
- } else {\
- return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\
- }\
-}
+#define GTEST_IMPL_CMP_HELPER_(op_name, op) \
+ template <typename T1, typename T2> \
+ AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+ const T1& val1, const T2& val2) { \
+ if (val1 op val2) { \
+ return AssertionSuccess(); \
+ } else { \
+ return CmpHelperOpFailure(expr1, expr2, val1, val2, #op); \
+ } \
+ }
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
@@ -1638,49 +1450,42 @@ GTEST_IMPL_CMP_HELPER_(GT, >)
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
const char* s2_expression,
- const char* s1,
- const char* s2);
+ const char* s1, const char* s2);
// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression,
const char* s2_expression,
- const char* s1,
- const char* s2);
+ const char* s1, const char* s2);
// The helper function for {ASSERT|EXPECT}_STRNE.
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
const char* s2_expression,
- const char* s1,
- const char* s2);
+ const char* s1, const char* s2);
// The helper function for {ASSERT|EXPECT}_STRCASENE.
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
const char* s2_expression,
- const char* s1,
- const char* s2);
-
+ const char* s1, const char* s2);
// Helper function for *_STREQ on wide strings.
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
const char* s2_expression,
- const wchar_t* s1,
- const wchar_t* s2);
+ const wchar_t* s1, const wchar_t* s2);
// Helper function for *_STRNE on wide strings.
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
const char* s2_expression,
- const wchar_t* s1,
- const wchar_t* s2);
+ const wchar_t* s1, const wchar_t* s2);
} // namespace internal
@@ -1692,32 +1497,40 @@ GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
//
// The {needle,haystack}_expr arguments are the stringified
// expressions that generated the two real arguments.
-GTEST_API_ AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const char* needle, const char* haystack);
-GTEST_API_ AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const wchar_t* needle, const wchar_t* haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const char* needle, const char* haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const wchar_t* needle, const wchar_t* haystack);
-GTEST_API_ AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::string& needle, const ::std::string& haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::string& needle, const ::std::string& haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const char* needle,
+ const char* haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const wchar_t* needle,
+ const wchar_t* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const char* needle,
+ const char* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const wchar_t* needle,
+ const wchar_t* haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::string& needle,
+ const ::std::string& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::string& needle,
+ const ::std::string& haystack);
#if GTEST_HAS_STD_WSTRING
-GTEST_API_ AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::wstring& needle, const ::std::wstring& haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::wstring& needle, const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::wstring& needle,
+ const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::wstring& needle,
+ const ::std::wstring& haystack);
#endif // GTEST_HAS_STD_WSTRING
namespace internal {
@@ -1732,8 +1545,7 @@ namespace internal {
template <typename RawType>
AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
const char* rhs_expression,
- RawType lhs_value,
- RawType rhs_value) {
+ RawType lhs_value, RawType rhs_value) {
const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
if (lhs.AlmostEquals(rhs)) {
@@ -1748,10 +1560,8 @@ AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
<< rhs_value;
- return EqFailure(lhs_expression,
- rhs_expression,
- StringStreamToString(&lhs_ss),
- StringStreamToString(&rhs_ss),
+ return EqFailure(lhs_expression, rhs_expression,
+ StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss),
false);
}
@@ -1761,8 +1571,7 @@ AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
const char* expr2,
const char* abs_error_expr,
- double val1,
- double val2,
+ double val1, double val2,
double abs_error);
// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -1770,9 +1579,7 @@ GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
class GTEST_API_ AssertHelper {
public:
// Constructor.
- AssertHelper(TestPartResult::Type type,
- const char* file,
- int line,
+ AssertHelper(TestPartResult::Type type, const char* file, int line,
const char* message);
~AssertHelper();
@@ -1786,11 +1593,9 @@ class GTEST_API_ AssertHelper {
// re-using stack space even for temporary variables, so every EXPECT_EQ
// reserves stack space for another AssertHelper.
struct AssertHelperData {
- AssertHelperData(TestPartResult::Type t,
- const char* srcfile,
- int line_num,
+ AssertHelperData(TestPartResult::Type t, const char* srcfile, int line_num,
const char* msg)
- : type(t), file(srcfile), line(line_num), message(msg) { }
+ : type(t), file(srcfile), line(line_num), message(msg) {}
TestPartResult::Type const type;
const char* const file;
@@ -1798,12 +1603,14 @@ class GTEST_API_ AssertHelper {
std::string const message;
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
+ AssertHelperData(const AssertHelperData&) = delete;
+ AssertHelperData& operator=(const AssertHelperData&) = delete;
};
AssertHelperData* const data_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
+ AssertHelper(const AssertHelper&) = delete;
+ AssertHelper& operator=(const AssertHelper&) = delete;
};
} // namespace internal
@@ -1860,15 +1667,14 @@ class WithParamInterface {
private:
// Sets parameter value. The caller is responsible for making sure the value
// remains alive and unchanged throughout the current test.
- static void SetParam(const ParamType* parameter) {
- parameter_ = parameter;
- }
+ static void SetParam(const ParamType* parameter) { parameter_ = parameter; }
// Static value used for accessing parameter during a test lifetime.
static const ParamType* parameter_;
// TestClass must be a subclass of WithParamInterface<T> and Test.
- template <class TestClass> friend class internal::ParameterizedTestFactory;
+ template <class TestClass>
+ friend class internal::ParameterizedTestFactory;
};
template <typename T>
@@ -1878,8 +1684,7 @@ const T* WithParamInterface<T>::parameter_ = nullptr;
// WithParamInterface, and can just inherit from ::testing::TestWithParam.
template <typename T>
-class TestWithParam : public Test, public WithParamInterface<T> {
-};
+class TestWithParam : public Test, public WithParamInterface<T> {};
// Macros for indicating success/failure in test code.
@@ -1910,7 +1715,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// Generates a nonfatal failure at the given source file location with
// a generic message.
-#define ADD_FAILURE_AT(file, line) \
+#define ADD_FAILURE_AT(file, line) \
GTEST_MESSAGE_AT_(file, line, "Failed", \
::testing::TestPartResult::kNonFatalFailure)
@@ -1925,7 +1730,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// Define this macro to 1 to omit the definition of FAIL(), which is a
// generic name and clashes with some other libraries.
#if !GTEST_DONT_DEFINE_FAIL
-# define FAIL() GTEST_FAIL()
+#define FAIL() GTEST_FAIL()
#endif
// Generates a success with a generic message.
@@ -1934,7 +1739,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// Define this macro to 1 to omit the definition of SUCCEED(), which
// is a generic name and clashes with some other libraries.
#if !GTEST_DONT_DEFINE_SUCCEED
-# define SUCCEED() GTEST_SUCCEED()
+#define SUCCEED() GTEST_SUCCEED()
#endif
// Macros for testing exceptions.
@@ -1962,16 +1767,15 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// Boolean assertions. Condition can be either a Boolean expression or an
// AssertionResult. For more information on how to use AssertionResult with
// these macros see comments on that class.
-#define GTEST_EXPECT_TRUE(condition) \
+#define GTEST_EXPECT_TRUE(condition) \
GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
GTEST_NONFATAL_FAILURE_)
-#define GTEST_EXPECT_FALSE(condition) \
+#define GTEST_EXPECT_FALSE(condition) \
GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
GTEST_NONFATAL_FAILURE_)
#define GTEST_ASSERT_TRUE(condition) \
- GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
- GTEST_FATAL_FAILURE_)
-#define GTEST_ASSERT_FALSE(condition) \
+ GTEST_TEST_BOOLEAN_(condition, #condition, false, true, GTEST_FATAL_FAILURE_)
+#define GTEST_ASSERT_FALSE(condition) \
GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
GTEST_FATAL_FAILURE_)
@@ -2070,27 +1874,27 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// ASSERT_XY(), which clashes with some users' own code.
#if !GTEST_DONT_DEFINE_ASSERT_EQ
-# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
#endif
#if !GTEST_DONT_DEFINE_ASSERT_NE
-# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
#endif
#if !GTEST_DONT_DEFINE_ASSERT_LE
-# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
#endif
#if !GTEST_DONT_DEFINE_ASSERT_LT
-# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
#endif
#if !GTEST_DONT_DEFINE_ASSERT_GE
-# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
#endif
#if !GTEST_DONT_DEFINE_ASSERT_GT
-# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
#endif
// C-string Comparisons. All tests treat NULL and any non-NULL string
@@ -2115,7 +1919,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
#define EXPECT_STRCASEEQ(s1, s2) \
EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
-#define EXPECT_STRCASENE(s1, s2)\
+#define EXPECT_STRCASENE(s1, s2) \
EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
#define ASSERT_STREQ(s1, s2) \
@@ -2124,7 +1928,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
#define ASSERT_STRCASEEQ(s1, s2) \
ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
-#define ASSERT_STRCASENE(s1, s2)\
+#define ASSERT_STRCASENE(s1, s2) \
ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
// Macros for comparing floating-point numbers.
@@ -2141,29 +1945,29 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// FloatingPoint template class in gtest-internal.h if you are
// interested in the implementation details.
-#define EXPECT_FLOAT_EQ(val1, val2)\
+#define EXPECT_FLOAT_EQ(val1, val2) \
EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
val1, val2)
-#define EXPECT_DOUBLE_EQ(val1, val2)\
+#define EXPECT_DOUBLE_EQ(val1, val2) \
EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
val1, val2)
-#define ASSERT_FLOAT_EQ(val1, val2)\
+#define ASSERT_FLOAT_EQ(val1, val2) \
ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
val1, val2)
-#define ASSERT_DOUBLE_EQ(val1, val2)\
+#define ASSERT_DOUBLE_EQ(val1, val2) \
ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
val1, val2)
-#define EXPECT_NEAR(val1, val2, abs_error)\
- EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
- val1, val2, abs_error)
+#define EXPECT_NEAR(val1, val2, abs_error) \
+ EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+ abs_error)
-#define ASSERT_NEAR(val1, val2, abs_error)\
- ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
- val1, val2, abs_error)
+#define ASSERT_NEAR(val1, val2, abs_error) \
+ ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+ abs_error)
// These predicate format functions work on floating-point values, and
// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
@@ -2177,7 +1981,6 @@ GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
double val1, double val2);
-
#if GTEST_OS_WINDOWS
// Macros that test for HRESULT failure and success, these are only useful
@@ -2189,17 +1992,17 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
// expected result and the actual result with both a human-readable
// string representation of the error, if available, as well as the
// hex result code.
-# define EXPECT_HRESULT_SUCCEEDED(expr) \
- EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+#define EXPECT_HRESULT_SUCCEEDED(expr) \
+ EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
-# define ASSERT_HRESULT_SUCCEEDED(expr) \
- ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+#define ASSERT_HRESULT_SUCCEEDED(expr) \
+ ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
-# define EXPECT_HRESULT_FAILED(expr) \
- EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+#define EXPECT_HRESULT_FAILED(expr) \
+ EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
-# define ASSERT_HRESULT_FAILED(expr) \
- ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+#define ASSERT_HRESULT_FAILED(expr) \
+ ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
#endif // GTEST_OS_WINDOWS
@@ -2214,9 +2017,9 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
// ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
//
#define ASSERT_NO_FATAL_FAILURE(statement) \
- GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+ GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
#define EXPECT_NO_FATAL_FAILURE(statement) \
- GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+ GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
// Causes a trace (including the given source file path and line number,
// and the given message) to be included in every test failure message generated
@@ -2258,7 +2061,8 @@ class GTEST_API_ ScopedTrace {
private:
void PushTrace(const char* file, int line, std::string message);
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
+ ScopedTrace(const ScopedTrace&) = delete;
+ ScopedTrace& operator=(const ScopedTrace&) = delete;
} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its
// c'tor and d'tor. Therefore it doesn't
// need to be used otherwise.
@@ -2278,9 +2082,9 @@ class GTEST_API_ ScopedTrace {
// Assuming that each thread maintains its own stack of traces.
// Therefore, a SCOPED_TRACE() would (correctly) only affect the
// assertions in its own thread.
-#define SCOPED_TRACE(message) \
- ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\
- __FILE__, __LINE__, (message))
+#define SCOPED_TRACE(message) \
+ ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)( \
+ __FILE__, __LINE__, (message))
// Compile-time assertion for type equality.
// StaticAssertTypeEq<type1, type2>() compiles if and only if type1 and type2
@@ -2378,20 +2182,19 @@ constexpr bool StaticAssertTypeEq() noexcept {
// EXPECT_EQ(a_.size(), 0);
// EXPECT_EQ(b_.size(), 1);
// }
-//
-// GOOGLETEST_CM0011 DO NOT DELETE
-#if !GTEST_DONT_DEFINE_TEST
-#define TEST_F(test_fixture, test_name)\
+#define GTEST_TEST_F(test_fixture, test_name) \
GTEST_TEST_(test_fixture, test_name, test_fixture, \
::testing::internal::GetTypeId<test_fixture>())
-#endif // !GTEST_DONT_DEFINE_TEST
+#if !GTEST_DONT_DEFINE_TEST_F
+#define TEST_F(test_fixture, test_name) GTEST_TEST_F(test_fixture, test_name)
+#endif
// Returns a path to temporary directory.
// Tries to determine an appropriate directory for the platform.
GTEST_API_ std::string TempDir();
#ifdef _MSC_VER
-# pragma warning(pop)
+#pragma warning(pop)
#endif
// Dynamically registers a test with the framework.
@@ -2445,6 +2248,7 @@ GTEST_API_ std::string TempDir();
// }
// ...
// int main(int argc, char** argv) {
+// ::testing::InitGoogleTest(&argc, argv);
// std::vector<int> values_to_test = LoadValuesFromConfig();
// RegisterMyTests(values_to_test);
// ...
@@ -2486,9 +2290,7 @@ TestInfo* RegisterTest(const char* test_suite_name, const char* test_name,
// namespace and has an all-caps name.
int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
-inline int RUN_ALL_TESTS() {
- return ::testing::UnitTest::GetInstance()->Run();
-}
+inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); }
GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
diff --git a/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/third_party/googletest/src/include/gtest/gtest_pred_impl.h
index 5029a9bb0..47a24aa68 100644
--- a/third_party/googletest/src/include/gtest/gtest_pred_impl.h
+++ b/third_party/googletest/src/include/gtest/gtest_pred_impl.h
@@ -26,17 +26,19 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// This file is AUTOMATICALLY GENERATED on 01/02/2019 by command
-// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND!
//
// Implements a family of generic predicate assertion macros.
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
-#include "gtest/gtest.h"
+#include "gtest/gtest-assertion-result.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
namespace testing {
@@ -72,22 +74,18 @@ namespace testing {
// GTEST_ASSERT_ is the basic statement to which all of the assertions
// in this file reduce. Don't use this in your code.
-#define GTEST_ASSERT_(expression, on_failure) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+#define GTEST_ASSERT_(expression, on_failure) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
if (const ::testing::AssertionResult gtest_ar = (expression)) \
- ; \
- else \
+ ; \
+ else \
on_failure(gtest_ar.failure_message())
-
// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use
// this in your code.
-template <typename Pred,
- typename T1>
-AssertionResult AssertPred1Helper(const char* pred_text,
- const char* e1,
- Pred pred,
- const T1& v1) {
+template <typename Pred, typename T1>
+AssertionResult AssertPred1Helper(const char* pred_text, const char* e1,
+ Pred pred, const T1& v1) {
if (pred(v1)) return AssertionSuccess();
return AssertionFailure()
@@ -98,40 +96,27 @@ AssertionResult AssertPred1Helper(const char* pred_text,
// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
// Don't use this in your code.
-#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
- GTEST_ASSERT_(pred_format(#v1, v1), \
- on_failure)
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, v1), on_failure)
// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use
// this in your code.
-#define GTEST_PRED1_(pred, v1, on_failure)\
- GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
- #v1, \
- pred, \
- v1), on_failure)
+#define GTEST_PRED1_(pred, v1, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, #v1, pred, v1), on_failure)
// Unary predicate assertion macros.
#define EXPECT_PRED_FORMAT1(pred_format, v1) \
GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED1(pred, v1) \
- GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
#define ASSERT_PRED_FORMAT1(pred_format, v1) \
GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED1(pred, v1) \
- GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
-
-
+#define ASSERT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use
// this in your code.
-template <typename Pred,
- typename T1,
- typename T2>
-AssertionResult AssertPred2Helper(const char* pred_text,
- const char* e1,
- const char* e2,
- Pred pred,
- const T1& v1,
+template <typename Pred, typename T1, typename T2>
+AssertionResult AssertPred2Helper(const char* pred_text, const char* e1,
+ const char* e2, Pred pred, const T1& v1,
const T2& v2) {
if (pred(v1, v2)) return AssertionSuccess();
@@ -145,19 +130,14 @@ AssertionResult AssertPred2Helper(const char* pred_text,
// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
// Don't use this in your code.
-#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
- GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
- on_failure)
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure)
// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use
// this in your code.
-#define GTEST_PRED2_(pred, v1, v2, on_failure)\
- GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
- #v1, \
- #v2, \
- pred, \
- v1, \
- v2), on_failure)
+#define GTEST_PRED2_(pred, v1, v2, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, #v1, #v2, pred, v1, v2), \
+ on_failure)
// Binary predicate assertion macros.
#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
@@ -169,22 +149,12 @@ AssertionResult AssertPred2Helper(const char* pred_text,
#define ASSERT_PRED2(pred, v1, v2) \
GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
-
-
// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use
// this in your code.
-template <typename Pred,
- typename T1,
- typename T2,
- typename T3>
-AssertionResult AssertPred3Helper(const char* pred_text,
- const char* e1,
- const char* e2,
- const char* e3,
- Pred pred,
- const T1& v1,
- const T2& v2,
- const T3& v3) {
+template <typename Pred, typename T1, typename T2, typename T3>
+AssertionResult AssertPred3Helper(const char* pred_text, const char* e1,
+ const char* e2, const char* e3, Pred pred,
+ const T1& v1, const T2& v2, const T3& v3) {
if (pred(v1, v2, v3)) return AssertionSuccess();
return AssertionFailure()
@@ -198,21 +168,15 @@ AssertionResult AssertPred3Helper(const char* pred_text,
// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
// Don't use this in your code.
-#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
- GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
- on_failure)
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), on_failure)
// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use
// this in your code.
-#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
- GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
- #v1, \
- #v2, \
- #v3, \
- pred, \
- v1, \
- v2, \
- v3), on_failure)
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure) \
+ GTEST_ASSERT_( \
+ ::testing::AssertPred3Helper(#pred, #v1, #v2, #v3, pred, v1, v2, v3), \
+ on_failure)
// Ternary predicate assertion macros.
#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
@@ -224,25 +188,13 @@ AssertionResult AssertPred3Helper(const char* pred_text,
#define ASSERT_PRED3(pred, v1, v2, v3) \
GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
-
-
// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use
// this in your code.
-template <typename Pred,
- typename T1,
- typename T2,
- typename T3,
- typename T4>
-AssertionResult AssertPred4Helper(const char* pred_text,
- const char* e1,
- const char* e2,
- const char* e3,
- const char* e4,
- Pred pred,
- const T1& v1,
- const T2& v2,
- const T3& v3,
- const T4& v4) {
+template <typename Pred, typename T1, typename T2, typename T3, typename T4>
+AssertionResult AssertPred4Helper(const char* pred_text, const char* e1,
+ const char* e2, const char* e3,
+ const char* e4, Pred pred, const T1& v1,
+ const T2& v2, const T3& v3, const T4& v4) {
if (pred(v1, v2, v3, v4)) return AssertionSuccess();
return AssertionFailure()
@@ -257,23 +209,15 @@ AssertionResult AssertPred4Helper(const char* pred_text,
// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
// Don't use this in your code.
-#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
- GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
- on_failure)
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), on_failure)
// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use
// this in your code.
-#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
- GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
- #v1, \
- #v2, \
- #v3, \
- #v4, \
- pred, \
- v1, \
- v2, \
- v3, \
- v4), on_failure)
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, #v1, #v2, #v3, #v4, pred, \
+ v1, v2, v3, v4), \
+ on_failure)
// 4-ary predicate assertion macros.
#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
@@ -285,28 +229,15 @@ AssertionResult AssertPred4Helper(const char* pred_text,
#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
-
-
// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use
// this in your code.
-template <typename Pred,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
+template <typename Pred, typename T1, typename T2, typename T3, typename T4,
typename T5>
-AssertionResult AssertPred5Helper(const char* pred_text,
- const char* e1,
- const char* e2,
- const char* e3,
- const char* e4,
- const char* e5,
- Pred pred,
- const T1& v1,
- const T2& v2,
- const T3& v3,
- const T4& v4,
- const T5& v5) {
+AssertionResult AssertPred5Helper(const char* pred_text, const char* e1,
+ const char* e2, const char* e3,
+ const char* e4, const char* e5, Pred pred,
+ const T1& v1, const T2& v2, const T3& v3,
+ const T4& v4, const T5& v5) {
if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
return AssertionFailure()
@@ -322,25 +253,16 @@ AssertionResult AssertPred5Helper(const char* pred_text,
// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
// Don't use this in your code.
-#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure) \
GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
on_failure)
// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use
// this in your code.
-#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\
- GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \
- #v1, \
- #v2, \
- #v3, \
- #v4, \
- #v5, \
- pred, \
- v1, \
- v2, \
- v3, \
- v4, \
- v5), on_failure)
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, #v1, #v2, #v3, #v4, #v5, \
+ pred, v1, v2, v3, v4, v5), \
+ on_failure)
// 5-ary predicate assertion macros.
#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
@@ -352,8 +274,6 @@ AssertionResult AssertPred5Helper(const char* pred_text,
#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
-
-
} // namespace testing
#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
diff --git a/third_party/googletest/src/include/gtest/gtest_prod.h b/third_party/googletest/src/include/gtest/gtest_prod.h
index 38b9d85a5..1f37dc31c 100644
--- a/third_party/googletest/src/include/gtest/gtest_prod.h
+++ b/third_party/googletest/src/include/gtest/gtest_prod.h
@@ -27,9 +27,8 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Google C++ Testing and Mocking Framework definitions useful in production code.
-// GOOGLETEST_CM0003 DO NOT DELETE
+// Google C++ Testing and Mocking Framework definitions useful in production
+// code.
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
@@ -55,7 +54,7 @@
// Note: The test class must be in the same namespace as the class being tested.
// For example, putting MyClassTest in an anonymous namespace will not work.
-#define FRIEND_TEST(test_case_name, test_name)\
-friend class test_case_name##_##test_name##_Test
+#define FRIEND_TEST(test_case_name, test_name) \
+ friend class test_case_name##_##test_name##_Test
#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
diff --git a/third_party/googletest/src/include/gtest/internal/custom/README.md b/third_party/googletest/src/include/gtest/internal/custom/README.md
index ff391fb4e..cb49e2c75 100644
--- a/third_party/googletest/src/include/gtest/internal/custom/README.md
+++ b/third_party/googletest/src/include/gtest/internal/custom/README.md
@@ -15,18 +15,6 @@ The custom directory is an injection point for custom user configurations.
The following macros can be defined:
-### Flag related macros:
-
-* `GTEST_FLAG(flag_name)`
-* `GTEST_USE_OWN_FLAGFILE_FLAG_` - Define to 0 when the system provides its
- own flagfile flag parsing.
-* `GTEST_DECLARE_bool_(name)`
-* `GTEST_DECLARE_int32_(name)`
-* `GTEST_DECLARE_string_(name)`
-* `GTEST_DEFINE_bool_(name, default_val, doc)`
-* `GTEST_DEFINE_int32_(name, default_val, doc)`
-* `GTEST_DEFINE_string_(name, default_val, doc)`
-
### Logging:
* `GTEST_LOG_(severity)`
diff --git a/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h
index db02881c0..9b7fb4261 100644
--- a/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h
+++ b/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h
@@ -34,4 +34,35 @@
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+// Use a stub Notification class.
+//
+// The built-in Notification class in GoogleTest v1.12.1 uses std::mutex and
+// std::condition_variable. The <mutex> and <condition_variable> headers of
+// mingw32 g++ (GNU 10.0.0) define std::mutex and std::condition_variable only
+// when configured with the posix threads option but don't define them when
+// configured with the win32 threads option. The Notification class is only
+// used in GoogleTest's internal tests. Since we don't build GoogleTest's
+// internal tests, we don't need a working Notification class. Although it's
+// not hard to fix the mingw32 g++ compilation errors by implementing the
+// Notification class using Windows CRITICAL_SECTION and CONDITION_VARIABLE,
+// it's simpler to just use a stub Notification class on all platforms.
+//
+// The default constructor of the stub class is deleted and the declaration of
+// the Notify() method is commented out, so that compilation will fail if any
+// code actually uses the Notification class.
+
+#define GTEST_HAS_NOTIFICATION_ 1
+namespace testing {
+namespace internal {
+class Notification {
+ public:
+ Notification() = delete;
+ Notification(const Notification&) = delete;
+ Notification& operator=(const Notification&) = delete;
+ // void Notify();
+ void WaitForNotification() {}
+};
+} // namespace internal
+} // namespace testing
+
#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
index 490296dfa..45580ae80 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
@@ -26,27 +26,31 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file defines internal utilities needed for implementing
// death tests. They are subject to change without notice.
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#include <stdio.h>
+
+#include <memory>
+
#include "gtest/gtest-matchers.h"
#include "gtest/internal/gtest-internal.h"
-#include <stdio.h>
-#include <memory>
+GTEST_DECLARE_string_(internal_run_death_test);
namespace testing {
namespace internal {
-GTEST_DECLARE_string_(internal_run_death_test);
-
// Names of the flags (needed for parsing Google Test flags).
const char kDeathTestStyleFlag[] = "death_test_style";
const char kDeathTestUseFork[] = "death_test_use_fork";
@@ -83,16 +87,18 @@ class GTEST_API_ DeathTest {
static bool Create(const char* statement, Matcher<const std::string&> matcher,
const char* file, int line, DeathTest** test);
DeathTest();
- virtual ~DeathTest() { }
+ virtual ~DeathTest() {}
// A helper class that aborts a death test when it's deleted.
class ReturnSentinel {
public:
- explicit ReturnSentinel(DeathTest* test) : test_(test) { }
+ explicit ReturnSentinel(DeathTest* test) : test_(test) {}
~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+
private:
DeathTest* const test_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
+ ReturnSentinel(const ReturnSentinel&) = delete;
+ ReturnSentinel& operator=(const ReturnSentinel&) = delete;
} GTEST_ATTRIBUTE_UNUSED_;
// An enumeration of possible roles that may be taken when a death
@@ -137,7 +143,8 @@ class GTEST_API_ DeathTest {
// A string containing a description of the outcome of the last death test.
static std::string last_death_test_message_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest);
+ DeathTest(const DeathTest&) = delete;
+ DeathTest& operator=(const DeathTest&) = delete;
};
GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
@@ -145,7 +152,7 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
// Factory interface for death tests. May be mocked out for testing.
class DeathTestFactory {
public:
- virtual ~DeathTestFactory() { }
+ virtual ~DeathTestFactory() {}
virtual bool Create(const char* statement,
Matcher<const std::string&> matcher, const char* file,
int line, DeathTest** test) = 0;
@@ -186,28 +193,28 @@ inline Matcher<const ::std::string&> MakeDeathTestMatcher(
// Traps C++ exceptions escaping statement and reports them as test
// failures. Note that trapping SEH exceptions is not implemented here.
-# if GTEST_HAS_EXCEPTIONS
-# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
- try { \
- GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
- } catch (const ::std::exception& gtest_exception) { \
- fprintf(\
- stderr, \
- "\n%s: Caught std::exception-derived exception escaping the " \
- "death test statement. Exception message: %s\n", \
+#if GTEST_HAS_EXCEPTIONS
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } catch (const ::std::exception& gtest_exception) { \
+ fprintf( \
+ stderr, \
+ "\n%s: Caught std::exception-derived exception escaping the " \
+ "death test statement. Exception message: %s\n", \
::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
- gtest_exception.what()); \
- fflush(stderr); \
+ gtest_exception.what()); \
+ fflush(stderr); \
death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
- } catch (...) { \
+ } catch (...) { \
death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
}
-# else
-# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+#else
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
-# endif
+#endif
// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
// ASSERT_EXIT*, and EXPECT_EXIT*.
@@ -236,8 +243,6 @@ inline Matcher<const ::std::string&> MakeDeathTestMatcher(
gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \
break; \
} \
- default: \
- break; \
} \
} \
} else \
@@ -265,16 +270,12 @@ inline Matcher<const ::std::string&> MakeDeathTestMatcher(
// RUN_ALL_TESTS was called.
class InternalRunDeathTestFlag {
public:
- InternalRunDeathTestFlag(const std::string& a_file,
- int a_line,
- int an_index,
+ InternalRunDeathTestFlag(const std::string& a_file, int a_line, int an_index,
int a_write_fd)
- : file_(a_file), line_(a_line), index_(an_index),
- write_fd_(a_write_fd) {}
+ : file_(a_file), line_(a_line), index_(an_index), write_fd_(a_write_fd) {}
~InternalRunDeathTestFlag() {
- if (write_fd_ >= 0)
- posix::Close(write_fd_);
+ if (write_fd_ >= 0) posix::Close(write_fd_);
}
const std::string& file() const { return file_; }
@@ -288,7 +289,8 @@ class InternalRunDeathTestFlag {
int index_;
int write_fd_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag);
+ InternalRunDeathTestFlag(const InternalRunDeathTestFlag&) = delete;
+ InternalRunDeathTestFlag& operator=(const InternalRunDeathTestFlag&) = delete;
};
// Returns a newly created InternalRunDeathTestFlag object with fields
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
index 0c033abc3..a2a60a962 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
@@ -26,7 +26,7 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// Google Test filepath utilities
//
// This header file declares classes and functions used internally by
@@ -35,7 +35,9 @@
// This file is #included in gtest/internal/gtest-internal.h.
// Do not include this header file separately!
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
@@ -61,8 +63,8 @@ namespace internal {
class GTEST_API_ FilePath {
public:
- FilePath() : pathname_("") { }
- FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { }
+ FilePath() : pathname_("") {}
+ FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) {}
explicit FilePath(const std::string& pathname) : pathname_(pathname) {
Normalize();
@@ -73,9 +75,7 @@ class GTEST_API_ FilePath {
return *this;
}
- void Set(const FilePath& rhs) {
- pathname_ = rhs.pathname_;
- }
+ void Set(const FilePath& rhs) { pathname_ = rhs.pathname_; }
const std::string& string() const { return pathname_; }
const char* c_str() const { return pathname_.c_str(); }
@@ -88,8 +88,7 @@ class GTEST_API_ FilePath {
// than zero (e.g., 12), returns "dir/test_12.xml".
// On Windows platform, uses \ as the separator rather than /.
static FilePath MakeFileName(const FilePath& directory,
- const FilePath& base_name,
- int number,
+ const FilePath& base_name, int number,
const char* extension);
// Given directory = "dir", relative_path = "test.xml",
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/third_party/googletest/src/include/gtest/internal/gtest-internal.h
index f8cbdbd81..9b04e4c85 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-internal.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-internal.h
@@ -26,13 +26,15 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file declares functions and macros used internally by
// Google Test. They are subject to change without notice.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
@@ -40,19 +42,20 @@
#include "gtest/internal/gtest-port.h"
#if GTEST_OS_LINUX
-# include <stdlib.h>
-# include <sys/types.h>
-# include <sys/wait.h>
-# include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
#endif // GTEST_OS_LINUX
#if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
+#include <stdexcept>
#endif
#include <ctype.h>
#include <float.h>
#include <string.h>
+
#include <cstdint>
#include <iomanip>
#include <limits>
@@ -76,7 +79,7 @@
// the current line number. For more details, see
// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
-#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo##bar
// Stringifies its argument.
// Work around a bug in visual studio which doesn't accept code like this:
@@ -98,21 +101,21 @@ namespace testing {
// Forward declarations.
-class AssertionResult; // Result of an assertion.
-class Message; // Represents a failure message.
-class Test; // Represents a test.
-class TestInfo; // Information about a test.
-class TestPartResult; // Result of a test part.
-class UnitTest; // A collection of test suites.
+class AssertionResult; // Result of an assertion.
+class Message; // Represents a failure message.
+class Test; // Represents a test.
+class TestInfo; // Information about a test.
+class TestPartResult; // Result of a test part.
+class UnitTest; // A collection of test suites.
template <typename T>
::std::string PrintToString(const T& value);
namespace internal {
-struct TraceInfo; // Information about a trace point.
-class TestInfoImpl; // Opaque implementation of TestInfo
-class UnitTestImpl; // Opaque implementation of UnitTest
+struct TraceInfo; // Information about a trace point.
+class TestInfoImpl; // Opaque implementation of TestInfo
+class UnitTestImpl; // Opaque implementation of UnitTest
// The text used in failure messages to indicate the start of the
// stack trace.
@@ -121,6 +124,7 @@ GTEST_API_ extern const char kStackTraceMarker[];
// An IgnoredValue object can be implicitly constructed from ANY value.
class IgnoredValue {
struct Sink {};
+
public:
// This constructor template allows any value to be implicitly
// converted to IgnoredValue. The object has no data member and
@@ -136,13 +140,13 @@ class IgnoredValue {
};
// Appends the user-supplied message to the Google-Test-generated message.
-GTEST_API_ std::string AppendUserMessage(
- const std::string& gtest_msg, const Message& user_msg);
+GTEST_API_ std::string AppendUserMessage(const std::string& gtest_msg,
+ const Message& user_msg);
#if GTEST_HAS_EXCEPTIONS
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4275 \
-/* an exported class was derived from a class that was not exported */)
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+ 4275 /* an exported class was derived from a class that was not exported */)
// This exception is thrown by (and only by) a failed Google Test
// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
@@ -181,14 +185,6 @@ GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string>& left,
} // namespace edit_distance
-// Calculate the diff between 'left' and 'right' and return it in unified diff
-// format.
-// If not null, stores in 'total_line_count' the total number of lines found
-// in left + right.
-GTEST_API_ std::string DiffStrings(const std::string& left,
- const std::string& right,
- size_t* total_line_count);
-
// Constructs and returns the message for an equality assertion
// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
//
@@ -212,10 +208,8 @@ GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
GTEST_API_ std::string GetBoolAssertionFailureMessage(
- const AssertionResult& assertion_result,
- const char* expression_text,
- const char* actual_predicate_value,
- const char* expected_predicate_value);
+ const AssertionResult& assertion_result, const char* expression_text,
+ const char* actual_predicate_value, const char* expected_predicate_value);
// This template class represents an IEEE floating-point number
// (either single-precision or double-precision, depending on the
@@ -256,11 +250,11 @@ class FloatingPoint {
// Constants.
// # of bits in a number.
- static const size_t kBitCount = 8*sizeof(RawType);
+ static const size_t kBitCount = 8 * sizeof(RawType);
// # of fraction bits in a number.
static const size_t kFractionBitCount =
- std::numeric_limits<RawType>::digits - 1;
+ std::numeric_limits<RawType>::digits - 1;
// # of exponent bits in a number.
static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
@@ -269,8 +263,8 @@ class FloatingPoint {
static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
// The mask for the fraction bits.
- static const Bits kFractionBitMask =
- ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
+ static const Bits kFractionBitMask = ~static_cast<Bits>(0) >>
+ (kExponentBitCount + 1);
// The mask for the exponent bits.
static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
@@ -309,9 +303,7 @@ class FloatingPoint {
}
// Returns the floating-point number that represent positive infinity.
- static RawType Infinity() {
- return ReinterpretBits(kExponentBitMask);
- }
+ static RawType Infinity() { return ReinterpretBits(kExponentBitMask); }
// Returns the maximum representable finite floating-point number.
static RawType Max();
@@ -319,7 +311,7 @@ class FloatingPoint {
// Non-static methods
// Returns the bits that represents this number.
- const Bits &bits() const { return u_.bits_; }
+ const Bits& bits() const { return u_.bits_; }
// Returns the exponent bits of this number.
Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
@@ -348,8 +340,8 @@ class FloatingPoint {
// a NAN must return false.
if (is_nan() || rhs.is_nan()) return false;
- return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
- <= kMaxUlps;
+ return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) <=
+ kMaxUlps;
}
private:
@@ -374,7 +366,7 @@ class FloatingPoint {
//
// Read http://en.wikipedia.org/wiki/Signed_number_representations
// for more details on signed number representations.
- static Bits SignAndMagnitudeToBiased(const Bits &sam) {
+ static Bits SignAndMagnitudeToBiased(const Bits& sam) {
if (kSignBitMask & sam) {
// sam represents a negative number.
return ~sam + 1;
@@ -386,8 +378,8 @@ class FloatingPoint {
// Given two numbers in the sign-and-magnitude representation,
// returns the distance between them as an unsigned number.
- static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
- const Bits &sam2) {
+ static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits& sam1,
+ const Bits& sam2) {
const Bits biased1 = SignAndMagnitudeToBiased(sam1);
const Bits biased2 = SignAndMagnitudeToBiased(sam2);
return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
@@ -399,9 +391,13 @@ class FloatingPoint {
// We cannot use std::numeric_limits<T>::max() as it clashes with the max()
// macro defined by <windows.h>.
template <>
-inline float FloatingPoint<float>::Max() { return FLT_MAX; }
+inline float FloatingPoint<float>::Max() {
+ return FLT_MAX;
+}
template <>
-inline double FloatingPoint<double>::Max() { return DBL_MAX; }
+inline double FloatingPoint<double>::Max() {
+ return DBL_MAX;
+}
// Typedefs the instances of the FloatingPoint template class that we
// care to use.
@@ -461,7 +457,8 @@ class TestFactoryBase {
TestFactoryBase() {}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase);
+ TestFactoryBase(const TestFactoryBase&) = delete;
+ TestFactoryBase& operator=(const TestFactoryBase&) = delete;
};
// This class provides implementation of TeastFactoryBase interface.
@@ -510,11 +507,11 @@ inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull(
template <typename T>
// Note that SuiteApiResolver inherits from T because
-// SetUpTestSuite()/TearDownTestSuite() could be protected. Ths way
+// SetUpTestSuite()/TearDownTestSuite() could be protected. This way
// SuiteApiResolver can access them.
struct SuiteApiResolver : T {
// testing::Test is only forward declared at this point. So we make it a
- // dependend class for the compiler to be OK with it.
+ // dependent class for the compiler to be OK with it.
using Test =
typename std::conditional<sizeof(T) != 0, ::testing::Test, void>::type;
@@ -654,7 +651,8 @@ inline const char* SkipComma(const char* str) {
if (comma == nullptr) {
return nullptr;
}
- while (IsSpace(*(++comma))) {}
+ while (IsSpace(*(++comma))) {
+ }
return comma;
}
@@ -668,7 +666,7 @@ inline std::string GetPrefixUntilComma(const char* str) {
// Splits a given string on a given delimiter, populating a given
// vector with the fields.
void SplitString(const ::std::string& str, char delimiter,
- ::std::vector< ::std::string>* dest);
+ ::std::vector<::std::string>* dest);
// The default argument to the template below for the case when the user does
// not provide a name generator.
@@ -781,13 +779,13 @@ class TypeParameterizedTestSuite {
const std::vector<std::string>& type_names =
GenerateNames<DefaultNameGenerator, Types>()) {
RegisterTypeParameterizedTestSuiteInstantiation(case_name);
- std::string test_name = StripTrailingSpaces(
- GetPrefixUntilComma(test_names));
+ std::string test_name =
+ StripTrailingSpaces(GetPrefixUntilComma(test_names));
if (!state->TestExists(test_name)) {
fprintf(stderr, "Failed to get code location for test %s.%s at %s.",
case_name, test_name.c_str(),
- FormatFileLocation(code_location.file.c_str(),
- code_location.line).c_str());
+ FormatFileLocation(code_location.file.c_str(), code_location.line)
+ .c_str());
fflush(stderr);
posix::Abort();
}
@@ -831,8 +829,8 @@ class TypeParameterizedTestSuite<Fixture, internal::None, Types> {
// For example, if Foo() calls Bar(), which in turn calls
// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(
- UnitTest* unit_test, int skip_count);
+GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(UnitTest* unit_test,
+ int skip_count);
// Helpers for suppressing warnings on unreachable code or constant
// condition.
@@ -881,7 +879,8 @@ class GTEST_API_ Random {
private:
uint32_t state_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Random);
+ Random(const Random&) = delete;
+ Random& operator=(const Random&) = delete;
};
// Turns const U&, U&, const U, and U all into U.
@@ -954,7 +953,9 @@ IsContainer IsContainerTest(int /* dummy */) {
typedef char IsNotContainer;
template <class C>
-IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
+IsNotContainer IsContainerTest(long /* dummy */) {
+ return '\0';
+}
// Trait to detect whether a type T is a hash table.
// The heuristic used is that the type contains an inner type `hasher` and does
@@ -1017,11 +1018,13 @@ bool ArrayEq(const T* lhs, size_t size, const U* rhs);
// This generic version is used when k is 0.
template <typename T, typename U>
-inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; }
+inline bool ArrayEq(const T& lhs, const U& rhs) {
+ return lhs == rhs;
+}
// This overload is used when k >= 1.
template <typename T, typename U, size_t N>
-inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
+inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) {
return internal::ArrayEq(lhs, N, rhs);
}
@@ -1031,8 +1034,7 @@ inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
template <typename T, typename U>
bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
for (size_t i = 0; i != size; i++) {
- if (!internal::ArrayEq(lhs[i], rhs[i]))
- return false;
+ if (!internal::ArrayEq(lhs[i], rhs[i])) return false;
}
return true;
}
@@ -1042,8 +1044,7 @@ bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
template <typename Iter, typename Element>
Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
for (Iter it = begin; it != end; ++it) {
- if (internal::ArrayEq(*it, elem))
- return it;
+ if (internal::ArrayEq(*it, elem)) return it;
}
return end;
}
@@ -1057,11 +1058,13 @@ void CopyArray(const T* from, size_t size, U* to);
// This generic version is used when k is 0.
template <typename T, typename U>
-inline void CopyArray(const T& from, U* to) { *to = from; }
+inline void CopyArray(const T& from, U* to) {
+ *to = from;
+}
// This overload is used when k >= 1.
template <typename T, typename U, size_t N>
-inline void CopyArray(const T(&from)[N], U(*to)[N]) {
+inline void CopyArray(const T (&from)[N], U (*to)[N]) {
internal::CopyArray(from, N, *to);
}
@@ -1114,8 +1117,7 @@ class NativeArray {
}
~NativeArray() {
- if (clone_ != &NativeArray::InitRef)
- delete[] array_;
+ if (clone_ != &NativeArray::InitRef) delete[] array_;
}
// STL-style container methods.
@@ -1123,8 +1125,7 @@ class NativeArray {
const_iterator begin() const { return array_; }
const_iterator end() const { return array_ + size_; }
bool operator==(const NativeArray& rhs) const {
- return size() == rhs.size() &&
- ArrayEq(begin(), size(), rhs.begin());
+ return size() == rhs.size() && ArrayEq(begin(), size(), rhs.begin());
}
private:
@@ -1335,9 +1336,9 @@ struct tuple_size<testing::internal::FlatTuple<Ts...>>
#endif
} // namespace std
-#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
- ::testing::internal::AssertHelper(result_type, file, line, message) \
- = ::testing::Message()
+#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
+ ::testing::internal::AssertHelper(result_type, file, line, message) = \
+ ::testing::Message()
#define GTEST_MESSAGE_(message, result_type) \
GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
@@ -1458,103 +1459,112 @@ class NeverThrown {
#endif // GTEST_HAS_EXCEPTIONS
-#define GTEST_TEST_NO_THROW_(statement, fail) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (::testing::internal::TrueWithString gtest_msg{}) { \
- try { \
- GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
- } \
- GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \
- catch (...) { \
- gtest_msg.value = "it throws."; \
- goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
- } \
- } else \
- GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \
- fail(("Expected: " #statement " doesn't throw an exception.\n" \
- " Actual: " + gtest_msg.value).c_str())
-
-#define GTEST_TEST_ANY_THROW_(statement, fail) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (::testing::internal::AlwaysTrue()) { \
- bool gtest_caught_any = false; \
- try { \
- GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
- } \
- catch (...) { \
- gtest_caught_any = true; \
- } \
- if (!gtest_caught_any) { \
+#define GTEST_TEST_NO_THROW_(statement, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::TrueWithString gtest_msg{}) { \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } \
+ GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \
+ catch (...) { \
+ gtest_msg.value = "it throws."; \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__) \
+ : fail(("Expected: " #statement " doesn't throw an exception.\n" \
+ " Actual: " + \
+ gtest_msg.value) \
+ .c_str())
+
+#define GTEST_TEST_ANY_THROW_(statement, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ bool gtest_caught_any = false; \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } catch (...) { \
+ gtest_caught_any = true; \
+ } \
+ if (!gtest_caught_any) { \
goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
- } \
- } else \
- GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \
- fail("Expected: " #statement " throws an exception.\n" \
- " Actual: it doesn't.")
-
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__) \
+ : fail("Expected: " #statement \
+ " throws an exception.\n" \
+ " Actual: it doesn't.")
// Implements Boolean test assertions such as EXPECT_TRUE. expression can be
// either a boolean expression or an AssertionResult. text is a textual
// representation of expression as it was passed into the EXPECT_TRUE.
#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (const ::testing::AssertionResult gtest_ar_ = \
- ::testing::AssertionResult(expression)) \
- ; \
- else \
- fail(::testing::internal::GetBoolAssertionFailureMessage(\
- gtest_ar_, text, #actual, #expected).c_str())
-
-#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (::testing::internal::AlwaysTrue()) { \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (const ::testing::AssertionResult gtest_ar_ = \
+ ::testing::AssertionResult(expression)) \
+ ; \
+ else \
+ fail(::testing::internal::GetBoolAssertionFailureMessage( \
+ gtest_ar_, text, #actual, #expected) \
+ .c_str())
+
+#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
- GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
- if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
- goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
- } \
- } else \
- GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \
- fail("Expected: " #statement " doesn't generate new fatal " \
- "failures in the current thread.\n" \
- " Actual: it does.")
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__) \
+ : fail("Expected: " #statement \
+ " doesn't generate new fatal " \
+ "failures in the current thread.\n" \
+ " Actual: it does.")
// Expands to the name of the class that implements the given test.
#define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
test_suite_name##_##test_name##_Test
// Helper macro for defining tests.
-#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \
- static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \
- "test_suite_name must not be empty"); \
- static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \
- "test_name must not be empty"); \
- class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
- : public parent_class { \
- public: \
- GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \
- ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \
- GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \
- test_name)); \
- GTEST_DISALLOW_MOVE_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \
- test_name)); \
- \
- private: \
- void TestBody() override; \
- static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \
- }; \
- \
- ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \
- test_name)::test_info_ = \
- ::testing::internal::MakeAndRegisterTestInfo( \
- #test_suite_name, #test_name, nullptr, nullptr, \
- ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \
- ::testing::internal::SuiteApiResolver< \
- parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \
- ::testing::internal::SuiteApiResolver< \
- parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \
- new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_( \
- test_suite_name, test_name)>); \
+#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \
+ static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \
+ "test_suite_name must not be empty"); \
+ static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \
+ "test_name must not be empty"); \
+ class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ : public parent_class { \
+ public: \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \
+ ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \
+ const GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name) &) = delete; /* NOLINT */ \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ (GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &&) noexcept = delete; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name) &&) noexcept = delete; /* NOLINT */ \
+ \
+ private: \
+ void TestBody() override; \
+ static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \
+ }; \
+ \
+ ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name)::test_info_ = \
+ ::testing::internal::MakeAndRegisterTestInfo( \
+ #test_suite_name, #test_name, nullptr, nullptr, \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \
+ ::testing::internal::SuiteApiResolver< \
+ parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \
+ ::testing::internal::SuiteApiResolver< \
+ parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \
+ new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_( \
+ test_suite_name, test_name)>); \
void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody()
#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h
index c2ef6e312..e7af2f904 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h
@@ -27,10 +27,11 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Type and function utilities for implementing parameterized tests.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
@@ -46,19 +47,18 @@
#include <utility>
#include <vector>
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-port.h"
#include "gtest/gtest-printers.h"
#include "gtest/gtest-test-part.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
namespace testing {
// Input to a parameterized test name generator, describing a test parameter.
// Consists of the parameter value and the integer parameter index.
template <class ParamType>
struct TestParamInfo {
- TestParamInfo(const ParamType& a_param, size_t an_index) :
- param(a_param),
- index(an_index) {}
+ TestParamInfo(const ParamType& a_param, size_t an_index)
+ : param(a_param), index(an_index) {}
ParamType param;
size_t index;
};
@@ -84,8 +84,10 @@ namespace internal {
GTEST_API_ void ReportInvalidTestSuiteType(const char* test_suite_name,
CodeLocation code_location);
-template <typename> class ParamGeneratorInterface;
-template <typename> class ParamGenerator;
+template <typename>
+class ParamGeneratorInterface;
+template <typename>
+class ParamGenerator;
// Interface for iterating over elements provided by an implementation
// of ParamGeneratorInterface<T>.
@@ -129,8 +131,7 @@ class ParamIterator {
// ParamIterator assumes ownership of the impl_ pointer.
ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
ParamIterator& operator=(const ParamIterator& other) {
- if (this != &other)
- impl_.reset(other.impl_->Clone());
+ if (this != &other) impl_.reset(other.impl_->Clone());
return *this;
}
@@ -157,7 +158,7 @@ class ParamIterator {
private:
friend class ParamGenerator<T>;
explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
- std::unique_ptr<ParamIteratorInterface<T> > impl_;
+ std::unique_ptr<ParamIteratorInterface<T>> impl_;
};
// ParamGeneratorInterface<T> is the binary interface to access generators
@@ -179,7 +180,7 @@ class ParamGeneratorInterface {
// This class implements copy initialization semantics and the contained
// ParamGeneratorInterface<T> instance is shared among all copies
// of the original object. This is possible because that instance is immutable.
-template<typename T>
+template <typename T>
class ParamGenerator {
public:
typedef ParamIterator<T> iterator;
@@ -196,7 +197,7 @@ class ParamGenerator {
iterator end() const { return iterator(impl_->End()); }
private:
- std::shared_ptr<const ParamGeneratorInterface<T> > impl_;
+ std::shared_ptr<const ParamGeneratorInterface<T>> impl_;
};
// Generates values from a range of two comparable values. Can be used to
@@ -207,8 +208,10 @@ template <typename T, typename IncrementT>
class RangeGenerator : public ParamGeneratorInterface<T> {
public:
RangeGenerator(T begin, T end, IncrementT step)
- : begin_(begin), end_(end),
- step_(step), end_index_(CalculateEndIndex(begin, end, step)) {}
+ : begin_(begin),
+ end_(end),
+ step_(step),
+ end_index_(CalculateEndIndex(begin, end, step)) {}
~RangeGenerator() override {}
ParamIteratorInterface<T>* Begin() const override {
@@ -251,7 +254,9 @@ class RangeGenerator : public ParamGeneratorInterface<T> {
private:
Iterator(const Iterator& other)
: ParamIteratorInterface<T>(),
- base_(other.base_), value_(other.value_), index_(other.index_),
+ base_(other.base_),
+ value_(other.value_),
+ index_(other.index_),
step_(other.step_) {}
// No implementation - assignment is unsupported.
@@ -263,12 +268,10 @@ class RangeGenerator : public ParamGeneratorInterface<T> {
const IncrementT step_;
}; // class RangeGenerator::Iterator
- static int CalculateEndIndex(const T& begin,
- const T& end,
+ static int CalculateEndIndex(const T& begin, const T& end,
const IncrementT& step) {
int end_index = 0;
- for (T i = begin; i < end; i = static_cast<T>(i + step))
- end_index++;
+ for (T i = begin; i < end; i = static_cast<T>(i + step)) end_index++;
return end_index;
}
@@ -283,7 +286,6 @@ class RangeGenerator : public ParamGeneratorInterface<T> {
const int end_index_;
}; // class RangeGenerator
-
// Generates values from a pair of STL-style iterators. Used in the
// ValuesIn() function. The elements are copied from the source range
// since the source can be located on the stack, and the generator
@@ -341,13 +343,13 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
<< "The program attempted to compare iterators "
<< "from different generators." << std::endl;
return iterator_ ==
- CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
+ CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
}
private:
Iterator(const Iterator& other)
- // The explicit constructor call suppresses a false warning
- // emitted by gcc when supplied with the -Wextra option.
+ // The explicit constructor call suppresses a false warning
+ // emitted by gcc when supplied with the -Wextra option.
: ParamIteratorInterface<T>(),
base_(other.base_),
iterator_(other.iterator_) {}
@@ -394,8 +396,8 @@ template <class TestClass>
class ParameterizedTestFactory : public TestFactoryBase {
public:
typedef typename TestClass::ParamType ParamType;
- explicit ParameterizedTestFactory(ParamType parameter) :
- parameter_(parameter) {}
+ explicit ParameterizedTestFactory(ParamType parameter)
+ : parameter_(parameter) {}
Test* CreateTest() override {
TestClass::SetParam(&parameter_);
return new TestClass();
@@ -404,7 +406,8 @@ class ParameterizedTestFactory : public TestFactoryBase {
private:
const ParamType parameter_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory);
+ ParameterizedTestFactory(const ParameterizedTestFactory&) = delete;
+ ParameterizedTestFactory& operator=(const ParameterizedTestFactory&) = delete;
};
// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -440,7 +443,8 @@ class TestMetaFactory
}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory);
+ TestMetaFactory(const TestMetaFactory&) = delete;
+ TestMetaFactory& operator=(const TestMetaFactory&) = delete;
};
// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -471,7 +475,10 @@ class ParameterizedTestSuiteInfoBase {
ParameterizedTestSuiteInfoBase() {}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfoBase);
+ ParameterizedTestSuiteInfoBase(const ParameterizedTestSuiteInfoBase&) =
+ delete;
+ ParameterizedTestSuiteInfoBase& operator=(
+ const ParameterizedTestSuiteInfoBase&) = delete;
};
// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -547,8 +554,8 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
test_it != tests_.end(); ++test_it) {
std::shared_ptr<TestInfo> test_info = *test_it;
for (typename InstantiationContainer::iterator gen_it =
- instantiations_.begin(); gen_it != instantiations_.end();
- ++gen_it) {
+ instantiations_.begin();
+ gen_it != instantiations_.end(); ++gen_it) {
const std::string& instantiation_name = gen_it->name;
ParamGenerator<ParamType> generator((*gen_it->generator)());
ParamNameGeneratorFunc* name_func = gen_it->name_func;
@@ -556,7 +563,7 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
int line = gen_it->line;
std::string test_suite_name;
- if ( !instantiation_name.empty() )
+ if (!instantiation_name.empty())
test_suite_name = instantiation_name + "/";
test_suite_name += test_info->test_suite_base_name;
@@ -569,17 +576,16 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
Message test_name_stream;
- std::string param_name = name_func(
- TestParamInfo<ParamType>(*param_it, i));
+ std::string param_name =
+ name_func(TestParamInfo<ParamType>(*param_it, i));
GTEST_CHECK_(IsValidParamName(param_name))
<< "Parameterized test name '" << param_name
- << "' is invalid, in " << file
- << " line " << line << std::endl;
+ << "' is invalid, in " << file << " line " << line << std::endl;
GTEST_CHECK_(test_param_names.count(param_name) == 0)
- << "Duplicate parameterized test name '" << param_name
- << "', in " << file << " line " << line << std::endl;
+ << "Duplicate parameterized test name '" << param_name << "', in "
+ << file << " line " << line << std::endl;
test_param_names.insert(param_name);
@@ -596,15 +602,15 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
SuiteApiResolver<TestSuite>::GetTearDownCaseOrSuite(file, line),
test_info->test_meta_factory->CreateTestFactory(*param_it));
} // for param_it
- } // for gen_it
- } // for test_it
+ } // for gen_it
+ } // for test_it
if (!generated_instantiations) {
// There are no generaotrs, or they all generate nothing ...
InsertSyntheticTestCase(GetTestSuiteName(), code_location_,
!tests_.empty());
}
- } // RegisterTests
+ } // RegisterTests
private:
// LocalTestInfo structure keeps information about a single test registered
@@ -620,42 +626,39 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
const std::string test_suite_base_name;
const std::string test_base_name;
- const std::unique_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory;
+ const std::unique_ptr<TestMetaFactoryBase<ParamType>> test_meta_factory;
const CodeLocation code_location;
};
- using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo> >;
+ using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo>>;
// Records data received from INSTANTIATE_TEST_SUITE_P macros:
// <Instantiation name, Sequence generator creation function,
// Name generator function, Source file, Source line>
struct InstantiationInfo {
- InstantiationInfo(const std::string &name_in,
- GeneratorCreationFunc* generator_in,
- ParamNameGeneratorFunc* name_func_in,
- const char* file_in,
- int line_in)
- : name(name_in),
- generator(generator_in),
- name_func(name_func_in),
- file(file_in),
- line(line_in) {}
-
- std::string name;
- GeneratorCreationFunc* generator;
- ParamNameGeneratorFunc* name_func;
- const char* file;
- int line;
+ InstantiationInfo(const std::string& name_in,
+ GeneratorCreationFunc* generator_in,
+ ParamNameGeneratorFunc* name_func_in, const char* file_in,
+ int line_in)
+ : name(name_in),
+ generator(generator_in),
+ name_func(name_func_in),
+ file(file_in),
+ line(line_in) {}
+
+ std::string name;
+ GeneratorCreationFunc* generator;
+ ParamNameGeneratorFunc* name_func;
+ const char* file;
+ int line;
};
typedef ::std::vector<InstantiationInfo> InstantiationContainer;
static bool IsValidParamName(const std::string& name) {
// Check for empty string
- if (name.empty())
- return false;
+ if (name.empty()) return false;
// Check for invalid characters
for (std::string::size_type index = 0; index < name.size(); ++index) {
- if (!IsAlNum(name[index]) && name[index] != '_')
- return false;
+ if (!IsAlNum(name[index]) && name[index] != '_') return false;
}
return true;
@@ -666,7 +669,9 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
TestInfoContainer tests_;
InstantiationContainer instantiations_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfo);
+ ParameterizedTestSuiteInfo(const ParameterizedTestSuiteInfo&) = delete;
+ ParameterizedTestSuiteInfo& operator=(const ParameterizedTestSuiteInfo&) =
+ delete;
}; // class ParameterizedTestSuiteInfo
// Legacy API is deprecated but still available
@@ -709,7 +714,7 @@ class ParameterizedTestSuiteRegistry {
// type we are looking for, so we downcast it to that type
// without further checks.
typed_test_info = CheckedDowncastToActualType<
- ParameterizedTestSuiteInfo<TestSuite> >(test_suite_info);
+ ParameterizedTestSuiteInfo<TestSuite>>(test_suite_info);
}
break;
}
@@ -741,7 +746,10 @@ class ParameterizedTestSuiteRegistry {
TestSuiteInfoContainer test_suite_infos_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteRegistry);
+ ParameterizedTestSuiteRegistry(const ParameterizedTestSuiteRegistry&) =
+ delete;
+ ParameterizedTestSuiteRegistry& operator=(
+ const ParameterizedTestSuiteRegistry&) = delete;
};
// Keep track of what type-parameterized test suite are defined and
@@ -836,7 +844,8 @@ class CartesianProductGenerator
: public ParamIteratorInterface<ParamType> {
public:
IteratorImpl(const ParamGeneratorInterface<ParamType>* base,
- const std::tuple<ParamGenerator<T>...>& generators, bool is_end)
+ const std::tuple<ParamGenerator<T>...>& generators,
+ bool is_end)
: base_(base),
begin_(std::get<I>(generators).begin()...),
end_(std::get<I>(generators).end()...),
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h
index dd845915e..f025db76a 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h
@@ -26,7 +26,7 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file defines the GTEST_OS_* macro.
@@ -37,70 +37,72 @@
// Determines the platform on which Google Test is compiled.
#ifdef __CYGWIN__
-# define GTEST_OS_CYGWIN 1
-# elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
-# define GTEST_OS_WINDOWS_MINGW 1
-# define GTEST_OS_WINDOWS 1
+#define GTEST_OS_CYGWIN 1
+#elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
+#define GTEST_OS_WINDOWS_MINGW 1
+#define GTEST_OS_WINDOWS 1
#elif defined _WIN32
-# define GTEST_OS_WINDOWS 1
-# ifdef _WIN32_WCE
-# define GTEST_OS_WINDOWS_MOBILE 1
-# elif defined(WINAPI_FAMILY)
-# include <winapifamily.h>
-# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-# define GTEST_OS_WINDOWS_DESKTOP 1
-# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
-# define GTEST_OS_WINDOWS_PHONE 1
-# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
-# define GTEST_OS_WINDOWS_RT 1
-# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE)
-# define GTEST_OS_WINDOWS_PHONE 1
-# define GTEST_OS_WINDOWS_TV_TITLE 1
-# else
- // WINAPI_FAMILY defined but no known partition matched.
- // Default to desktop.
-# define GTEST_OS_WINDOWS_DESKTOP 1
-# endif
-# else
-# define GTEST_OS_WINDOWS_DESKTOP 1
-# endif // _WIN32_WCE
+#define GTEST_OS_WINDOWS 1
+#ifdef _WIN32_WCE
+#define GTEST_OS_WINDOWS_MOBILE 1
+#elif defined(WINAPI_FAMILY)
+#include <winapifamily.h>
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+#define GTEST_OS_WINDOWS_PHONE 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+#define GTEST_OS_WINDOWS_RT 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE)
+#define GTEST_OS_WINDOWS_PHONE 1
+#define GTEST_OS_WINDOWS_TV_TITLE 1
+#else
+// WINAPI_FAMILY defined but no known partition matched.
+// Default to desktop.
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#endif
+#else
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#endif // _WIN32_WCE
#elif defined __OS2__
-# define GTEST_OS_OS2 1
+#define GTEST_OS_OS2 1
#elif defined __APPLE__
-# define GTEST_OS_MAC 1
-# include <TargetConditionals.h>
-# if TARGET_OS_IPHONE
-# define GTEST_OS_IOS 1
-# endif
+#define GTEST_OS_MAC 1
+#include <TargetConditionals.h>
+#if TARGET_OS_IPHONE
+#define GTEST_OS_IOS 1
+#endif
#elif defined __DragonFly__
-# define GTEST_OS_DRAGONFLY 1
+#define GTEST_OS_DRAGONFLY 1
#elif defined __FreeBSD__
-# define GTEST_OS_FREEBSD 1
+#define GTEST_OS_FREEBSD 1
#elif defined __Fuchsia__
-# define GTEST_OS_FUCHSIA 1
+#define GTEST_OS_FUCHSIA 1
+#elif defined(__GNU__)
+#define GTEST_OS_GNU_HURD 1
#elif defined(__GLIBC__) && defined(__FreeBSD_kernel__)
-# define GTEST_OS_GNU_KFREEBSD 1
+#define GTEST_OS_GNU_KFREEBSD 1
#elif defined __linux__
-# define GTEST_OS_LINUX 1
-# if defined __ANDROID__
-# define GTEST_OS_LINUX_ANDROID 1
-# endif
+#define GTEST_OS_LINUX 1
+#if defined __ANDROID__
+#define GTEST_OS_LINUX_ANDROID 1
+#endif
#elif defined __MVS__
-# define GTEST_OS_ZOS 1
+#define GTEST_OS_ZOS 1
#elif defined(__sun) && defined(__SVR4)
-# define GTEST_OS_SOLARIS 1
+#define GTEST_OS_SOLARIS 1
#elif defined(_AIX)
-# define GTEST_OS_AIX 1
+#define GTEST_OS_AIX 1
#elif defined(__hpux)
-# define GTEST_OS_HPUX 1
+#define GTEST_OS_HPUX 1
#elif defined __native_client__
-# define GTEST_OS_NACL 1
+#define GTEST_OS_NACL 1
#elif defined __NetBSD__
-# define GTEST_OS_NETBSD 1
+#define GTEST_OS_NETBSD 1
#elif defined __OpenBSD__
-# define GTEST_OS_OPENBSD 1
+#define GTEST_OS_OPENBSD 1
#elif defined __QNX__
-# define GTEST_OS_QNX 1
+#define GTEST_OS_QNX 1
#elif defined(__HAIKU__)
#define GTEST_OS_HAIKU 1
#elif defined ESP8266
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port.h b/third_party/googletest/src/include/gtest/internal/gtest-port.h
index 0953a781c..0003d2765 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-port.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-port.h
@@ -26,7 +26,7 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// Low-level types and utilities for porting Google Test to various
// platforms. All macros ending with _ and symbols defined in an
// internal namespace are subject to change without notice. Code
@@ -38,7 +38,9 @@
// files are expected to #include this. Therefore, it cannot #include
// any other Google Test header.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
@@ -116,6 +118,7 @@
// GTEST_OS_DRAGONFLY - DragonFlyBSD
// GTEST_OS_FREEBSD - FreeBSD
// GTEST_OS_FUCHSIA - Fuchsia
+// GTEST_OS_GNU_HURD - GNU/Hurd
// GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD
// GTEST_OS_HAIKU - Haiku
// GTEST_OS_HPUX - HP-UX
@@ -167,7 +170,7 @@
// GTEST_HAS_TYPED_TEST - typed tests
// GTEST_HAS_TYPED_TEST_P - type-parameterized tests
// GTEST_IS_THREADSAFE - Google Test is thread-safe.
-// GOOGLETEST_CM0007 DO NOT DELETE
+// GTEST_USES_RE2 - the RE2 regular expression library is used
// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with
// GTEST_HAS_POSIX_RE (see above) which users can
// define themselves.
@@ -190,10 +193,6 @@
// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a
// variable don't have to be used.
-// GTEST_DISALLOW_ASSIGN_ - disables copy operator=.
-// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
-// GTEST_DISALLOW_MOVE_ASSIGN_ - disables move operator=.
-// GTEST_DISALLOW_MOVE_AND_ASSIGN_ - disables move ctor and operator=.
// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used.
// GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is
// suppressed (constant conditional).
@@ -217,11 +216,13 @@
// - synchronization primitives.
//
// Regular expressions:
-// RE - a simple regular expression class using the POSIX
-// Extended Regular Expression syntax on UNIX-like platforms
-// GOOGLETEST_CM0008 DO NOT DELETE
-// or a reduced regular exception syntax on other
-// platforms, including Windows.
+// RE - a simple regular expression class using
+// 1) the RE2 syntax on all platforms when built with RE2
+// and Abseil as dependencies
+// 2) the POSIX Extended Regular Expression syntax on
+// UNIX-like platforms,
+// 3) A reduced regular exception syntax on other platforms,
+// including Windows.
// Logging:
// GTEST_LOG_() - logs messages at the specified severity level.
// LogToStderr() - directs all log messages to stderr.
@@ -241,8 +242,6 @@
// BiggestInt - the biggest signed integer type.
//
// Command-line utilities:
-// GTEST_DECLARE_*() - declares a flag.
-// GTEST_DEFINE_*() - defines a flag.
// GetInjectableArgvs() - returns the command line as a vector of strings.
//
// Environment variable utilities:
@@ -263,48 +262,55 @@
#include <string.h>
#include <cerrno>
+// #include <condition_variable> // Guarded by GTEST_IS_THREADSAFE below
#include <cstdint>
+#include <iostream>
#include <limits>
+#include <locale>
+#include <memory>
+#include <string>
+// #include <mutex> // Guarded by GTEST_IS_THREADSAFE below
+#include <tuple>
#include <type_traits>
+#include <vector>
#ifndef _WIN32_WCE
-# include <sys/types.h>
-# include <sys/stat.h>
+#include <sys/stat.h>
+#include <sys/types.h>
#endif // !_WIN32_WCE
#if defined __APPLE__
-# include <AvailabilityMacros.h>
-# include <TargetConditionals.h>
+#include <AvailabilityMacros.h>
+#include <TargetConditionals.h>
#endif
-#include <iostream> // NOLINT
-#include <locale>
-#include <memory>
-#include <string> // NOLINT
-#include <tuple>
-#include <vector> // NOLINT
-
#include "gtest/internal/custom/gtest-port.h"
#include "gtest/internal/gtest-port-arch.h"
+#if GTEST_HAS_ABSL
+#include "absl/flags/declare.h"
+#include "absl/flags/flag.h"
+#include "absl/flags/reflection.h"
+#endif
+
#if !defined(GTEST_DEV_EMAIL_)
-# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
-# define GTEST_FLAG_PREFIX_ "gtest_"
-# define GTEST_FLAG_PREFIX_DASH_ "gtest-"
-# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
-# define GTEST_NAME_ "Google Test"
-# define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
+#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
+#define GTEST_FLAG_PREFIX_ "gtest_"
+#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
+#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
+#define GTEST_NAME_ "Google Test"
+#define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
#endif // !defined(GTEST_DEV_EMAIL_)
#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
-# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
+#define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
#endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
// Determines the version of gcc that is used to compile this.
#ifdef __GNUC__
// 40302 means version 4.3.2.
-# define GTEST_GCC_VER_ \
- (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+#define GTEST_GCC_VER_ \
+ (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#endif // __GNUC__
// Macros for disabling Microsoft Visual C++ warnings.
@@ -313,41 +319,37 @@
// /* code that triggers warnings C4800 and C4385 */
// GTEST_DISABLE_MSC_WARNINGS_POP_()
#if defined(_MSC_VER)
-# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
- __pragma(warning(push)) \
- __pragma(warning(disable: warnings))
-# define GTEST_DISABLE_MSC_WARNINGS_POP_() \
- __pragma(warning(pop))
+#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
+ __pragma(warning(push)) __pragma(warning(disable : warnings))
+#define GTEST_DISABLE_MSC_WARNINGS_POP_() __pragma(warning(pop))
#else
// Not all compilers are MSVC
-# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
-# define GTEST_DISABLE_MSC_WARNINGS_POP_()
+#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
+#define GTEST_DISABLE_MSC_WARNINGS_POP_()
#endif
// Clang on Windows does not understand MSVC's pragma warning.
// We need clang-specific way to disable function deprecation warning.
#ifdef __clang__
-# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
- _Pragma("clang diagnostic push") \
- _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \
- _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"")
-#define GTEST_DISABLE_MSC_DEPRECATED_POP_() \
- _Pragma("clang diagnostic pop")
+#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
+ _Pragma("clang diagnostic push") \
+ _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \
+ _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"")
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() _Pragma("clang diagnostic pop")
#else
-# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
- GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
-# define GTEST_DISABLE_MSC_DEPRECATED_POP_() \
- GTEST_DISABLE_MSC_WARNINGS_POP_()
+#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
#endif
// Brings in definitions for functions used in the testing::internal::posix
// namespace (read, write, close, chdir, isatty, stat). We do not currently
// use them on Windows Mobile.
#if GTEST_OS_WINDOWS
-# if !GTEST_OS_WINDOWS_MOBILE
-# include <direct.h>
-# include <io.h>
-# endif
+#if !GTEST_OS_WINDOWS_MOBILE
+#include <direct.h>
+#include <io.h>
+#endif
// In order to avoid having to include <windows.h>, use forward declaration
#if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR)
// MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two
@@ -367,68 +369,55 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// This assumes that non-Windows OSes provide unistd.h. For OSes where this
// is not the case, we need to include headers that provide the functions
// mentioned above.
-# include <unistd.h>
-# include <strings.h>
+#include <strings.h>
+#include <unistd.h>
#endif // GTEST_OS_WINDOWS
#if GTEST_OS_LINUX_ANDROID
// Used to define __ANDROID_API__ matching the target NDK API level.
-# include <android/api-level.h> // NOLINT
+#include <android/api-level.h> // NOLINT
#endif
// Defines this to true if and only if Google Test can use POSIX regular
// expressions.
#ifndef GTEST_HAS_POSIX_RE
-# if GTEST_OS_LINUX_ANDROID
+#if GTEST_OS_LINUX_ANDROID
// On Android, <regex.h> is only available starting with Gingerbread.
-# define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
-# else
+#define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
+#else
#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS && !GTEST_OS_XTENSA)
-# endif
+#endif
#endif
-#if GTEST_USES_PCRE
-// The appropriate headers have already been included.
-
+// Select the regular expression implementation.
+#if GTEST_HAS_ABSL
+// When using Abseil, RE2 is required.
+#include "absl/strings/string_view.h"
+#include "re2/re2.h"
+#define GTEST_USES_RE2 1
#elif GTEST_HAS_POSIX_RE
-
-// On some platforms, <regex.h> needs someone to define size_t, and
-// won't compile otherwise. We can #include it here as we already
-// included <stdlib.h>, which is guaranteed to define size_t through
-// <stddef.h>.
-# include <regex.h> // NOLINT
-
-# define GTEST_USES_POSIX_RE 1
-
-#elif GTEST_OS_WINDOWS
-
-// <regex.h> is not available on Windows. Use our own simple regex
-// implementation instead.
-# define GTEST_USES_SIMPLE_RE 1
-
+#include <regex.h> // NOLINT
+#define GTEST_USES_POSIX_RE 1
#else
-
-// <regex.h> may not be available on this platform. Use our own
-// simple regex implementation instead.
-# define GTEST_USES_SIMPLE_RE 1
-
-#endif // GTEST_USES_PCRE
+// Use our own simple regex implementation.
+#define GTEST_USES_SIMPLE_RE 1
+#endif
#ifndef GTEST_HAS_EXCEPTIONS
// The user didn't tell us whether exceptions are enabled, so we need
// to figure it out.
-# if defined(_MSC_VER) && defined(_CPPUNWIND)
+#if defined(_MSC_VER) && defined(_CPPUNWIND)
// MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled.
-# define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__BORLANDC__)
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__BORLANDC__)
// C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS
// macro to enable exceptions, so we'll do the same.
// Assumes that exceptions are enabled by default.
-# ifndef _HAS_EXCEPTIONS
-# define _HAS_EXCEPTIONS 1
-# endif // _HAS_EXCEPTIONS
-# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
-# elif defined(__clang__)
+#ifndef _HAS_EXCEPTIONS
+#define _HAS_EXCEPTIONS 1
+#endif // _HAS_EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+#elif defined(__clang__)
// clang defines __EXCEPTIONS if and only if exceptions are enabled before clang
// 220714, but if and only if cleanups are enabled after that. In Obj-C++ files,
// there can be cleanups for ObjC exceptions which also need cleanups, even if
@@ -437,27 +426,27 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// cleanups prior to that. To reliably check for C++ exception availability with
// clang, check for
// __EXCEPTIONS && __has_feature(cxx_exceptions).
-# define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
-# elif defined(__GNUC__) && __EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
+#elif defined(__GNUC__) && __EXCEPTIONS
// gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
-# define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__SUNPRO_CC)
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__SUNPRO_CC)
// Sun Pro CC supports exceptions. However, there is no compile-time way of
// detecting whether they are enabled or not. Therefore, we assume that
// they are enabled unless the user tells us otherwise.
-# define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__IBMCPP__) && __EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__IBMCPP__) && __EXCEPTIONS
// xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
-# define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__HP_aCC)
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__HP_aCC)
// Exception handling is in effect by default in HP aCC compiler. It has to
// be turned of by +noeh compiler option if desired.
-# define GTEST_HAS_EXCEPTIONS 1
-# else
+#define GTEST_HAS_EXCEPTIONS 1
+#else
// For other compilers, we assume exceptions are disabled to be
// conservative.
-# define GTEST_HAS_EXCEPTIONS 0
-# endif // defined(_MSC_VER) || defined(__BORLANDC__)
+#define GTEST_HAS_EXCEPTIONS 0
+#endif // defined(_MSC_VER) || defined(__BORLANDC__)
#endif // GTEST_HAS_EXCEPTIONS
#ifndef GTEST_HAS_STD_WSTRING
@@ -477,63 +466,62 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// The user didn't tell us whether RTTI is enabled, so we need to
// figure it out.
-# ifdef _MSC_VER
+#ifdef _MSC_VER
#ifdef _CPPRTTI // MSVC defines this macro if and only if RTTI is enabled.
-# define GTEST_HAS_RTTI 1
-# else
-# define GTEST_HAS_RTTI 0
-# endif
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
// Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI is
// enabled.
-# elif defined(__GNUC__)
+#elif defined(__GNUC__)
-# ifdef __GXX_RTTI
+#ifdef __GXX_RTTI
// When building against STLport with the Android NDK and with
// -frtti -fno-exceptions, the build fails at link time with undefined
// references to __cxa_bad_typeid. Note sure if STL or toolchain bug,
// so disable RTTI when detected.
-# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
- !defined(__EXCEPTIONS)
-# define GTEST_HAS_RTTI 0
-# else
-# define GTEST_HAS_RTTI 1
-# endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
-# else
-# define GTEST_HAS_RTTI 0
-# endif // __GXX_RTTI
+#if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && !defined(__EXCEPTIONS)
+#define GTEST_HAS_RTTI 0
+#else
+#define GTEST_HAS_RTTI 1
+#endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
+#else
+#define GTEST_HAS_RTTI 0
+#endif // __GXX_RTTI
// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
// first version with C++ support.
-# elif defined(__clang__)
+#elif defined(__clang__)
-# define GTEST_HAS_RTTI __has_feature(cxx_rtti)
+#define GTEST_HAS_RTTI __has_feature(cxx_rtti)
// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
// both the typeid and dynamic_cast features are present.
-# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
+#elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
-# ifdef __RTTI_ALL__
-# define GTEST_HAS_RTTI 1
-# else
-# define GTEST_HAS_RTTI 0
-# endif
+#ifdef __RTTI_ALL__
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
-# else
+#else
// For all other compilers, we assume RTTI is enabled.
-# define GTEST_HAS_RTTI 1
+#define GTEST_HAS_RTTI 1
-# endif // _MSC_VER
+#endif // _MSC_VER
#endif // GTEST_HAS_RTTI
// It's this header's responsibility to #include <typeinfo> when RTTI
// is enabled.
#if GTEST_HAS_RTTI
-# include <typeinfo>
+#include <typeinfo>
#endif
// Determines whether Google Test can use the pthreads library.
@@ -547,16 +535,16 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
(GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \
GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \
GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD || \
- GTEST_OS_HAIKU)
+ GTEST_OS_HAIKU || GTEST_OS_GNU_HURD)
#endif // GTEST_HAS_PTHREAD
#if GTEST_HAS_PTHREAD
// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
// true.
-# include <pthread.h> // NOLINT
+#include <pthread.h> // NOLINT
// For timespec and nanosleep, used below.
-# include <time.h> // NOLINT
+#include <time.h> // NOLINT
#endif
// Determines whether clone(2) is supported.
@@ -566,24 +554,23 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
#ifndef GTEST_HAS_CLONE
// The user didn't tell us, so we need to figure it out.
-# if GTEST_OS_LINUX && !defined(__ia64__)
-# if GTEST_OS_LINUX_ANDROID
+#if GTEST_OS_LINUX && !defined(__ia64__)
+#if GTEST_OS_LINUX_ANDROID
// On Android, clone() became available at different API levels for each 32-bit
// architecture.
-# if defined(__LP64__) || \
- (defined(__arm__) && __ANDROID_API__ >= 9) || \
- (defined(__mips__) && __ANDROID_API__ >= 12) || \
- (defined(__i386__) && __ANDROID_API__ >= 17)
-# define GTEST_HAS_CLONE 1
-# else
-# define GTEST_HAS_CLONE 0
-# endif
-# else
-# define GTEST_HAS_CLONE 1
-# endif
-# else
-# define GTEST_HAS_CLONE 0
-# endif // GTEST_OS_LINUX && !defined(__ia64__)
+#if defined(__LP64__) || (defined(__arm__) && __ANDROID_API__ >= 9) || \
+ (defined(__mips__) && __ANDROID_API__ >= 12) || \
+ (defined(__i386__) && __ANDROID_API__ >= 17)
+#define GTEST_HAS_CLONE 1
+#else
+#define GTEST_HAS_CLONE 0
+#endif
+#else
+#define GTEST_HAS_CLONE 1
+#endif
+#else
+#define GTEST_HAS_CLONE 0
+#endif // GTEST_OS_LINUX && !defined(__ia64__)
#endif // GTEST_HAS_CLONE
@@ -594,10 +581,10 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// platforms except known mobile ones.
#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA
-# define GTEST_HAS_STREAM_REDIRECTION 0
-# else
-# define GTEST_HAS_STREAM_REDIRECTION 1
-# endif // !GTEST_OS_WINDOWS_MOBILE
+#define GTEST_HAS_STREAM_REDIRECTION 0
+#else
+#define GTEST_HAS_STREAM_REDIRECTION 1
+#endif // !GTEST_OS_WINDOWS_MOBILE
#endif // GTEST_HAS_STREAM_REDIRECTION
// Determines whether to support death tests.
@@ -607,8 +594,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
(GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW || \
GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \
GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \
- GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU)
-# define GTEST_HAS_DEATH_TEST 1
+ GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU || \
+ GTEST_OS_GNU_HURD)
+#define GTEST_HAS_DEATH_TEST 1
#endif
// Determines whether to support type-driven tests.
@@ -617,8 +605,8 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// Sun Pro CC, IBM Visual Age, and HP aCC support.
#if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \
defined(__IBMCPP__) || defined(__HP_aCC)
-# define GTEST_HAS_TYPED_TEST 1
-# define GTEST_HAS_TYPED_TEST_P 1
+#define GTEST_HAS_TYPED_TEST 1
+#define GTEST_HAS_TYPED_TEST_P 1
#endif
// Determines whether the system compiler uses UTF-16 for encoding wide strings.
@@ -627,8 +615,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// Determines whether test results can be streamed to a socket.
#if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \
- GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD
-# define GTEST_CAN_STREAM_RESULTS_ 1
+ GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD || \
+ GTEST_OS_GNU_HURD
+#define GTEST_CAN_STREAM_RESULTS_ 1
#endif
// Defines some utility macros.
@@ -642,9 +631,12 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
//
// The "switch (0) case 0:" idiom is used to suppress this.
#ifdef __INTEL_COMPILER
-# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+#define GTEST_AMBIGUOUS_ELSE_BLOCKER_
#else
-# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default: // NOLINT
+#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ switch (0) \
+ case 0: \
+ default: // NOLINT
#endif
// Use this annotation at the end of a struct/class definition to
@@ -659,55 +651,32 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// Also use it after a variable or parameter declaration to tell the
// compiler the variable/parameter does not have to be used.
#if defined(__GNUC__) && !defined(COMPILER_ICC)
-# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
#elif defined(__clang__)
-# if __has_attribute(unused)
-# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
-# endif
+#if __has_attribute(unused)
+#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
+#endif
#endif
#ifndef GTEST_ATTRIBUTE_UNUSED_
-# define GTEST_ATTRIBUTE_UNUSED_
+#define GTEST_ATTRIBUTE_UNUSED_
#endif
// Use this annotation before a function that takes a printf format string.
#if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC)
-# if defined(__MINGW_PRINTF_FORMAT)
+#if defined(__MINGW_PRINTF_FORMAT)
// MinGW has two different printf implementations. Ensure the format macro
// matches the selected implementation. See
// https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/.
-# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
- __attribute__((__format__(__MINGW_PRINTF_FORMAT, string_index, \
- first_to_check)))
-# else
-# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
- __attribute__((__format__(__printf__, string_index, first_to_check)))
-# endif
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+ __attribute__(( \
+ __format__(__MINGW_PRINTF_FORMAT, string_index, first_to_check)))
#else
-# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check)
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+ __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#else
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check)
#endif
-
-
-// A macro to disallow copy operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_ASSIGN_(type) \
- type& operator=(type const &) = delete
-
-// A macro to disallow copy constructor and operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \
- type(type const&) = delete; \
- type& operator=(type const&) = delete
-
-// A macro to disallow move operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_MOVE_ASSIGN_(type) \
- type& operator=(type &&) noexcept = delete
-
-// A macro to disallow move constructor and operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_MOVE_AND_ASSIGN_(type) \
- type(type&&) noexcept = delete; \
- type& operator=(type&&) noexcept = delete
// Tell the compiler to warn about unused return values for functions declared
// with this macro. The macro should be used on function declarations
@@ -715,9 +684,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
//
// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
#if defined(__GNUC__) && !defined(COMPILER_ICC)
-# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result))
+#define GTEST_MUST_USE_RESULT_ __attribute__((warn_unused_result))
#else
-# define GTEST_MUST_USE_RESULT_
+#define GTEST_MUST_USE_RESULT_
#endif // __GNUC__ && !COMPILER_ICC
// MS C++ compiler emits warning when a conditional expression is compile time
@@ -728,10 +697,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// while (true) {
// GTEST_INTENTIONAL_CONST_COND_POP_()
// }
-# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
- GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
-# define GTEST_INTENTIONAL_CONST_COND_POP_() \
- GTEST_DISABLE_MSC_WARNINGS_POP_()
+#define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
+#define GTEST_INTENTIONAL_CONST_COND_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
// Determine whether the compiler supports Microsoft's Structured Exception
// Handling. This is supported by several Windows compilers but generally
@@ -739,13 +707,13 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
#ifndef GTEST_HAS_SEH
// The user didn't tell us, so we need to figure it out.
-# if defined(_MSC_VER) || defined(__BORLANDC__)
+#if defined(_MSC_VER) || defined(__BORLANDC__)
// These two compilers are known to support SEH.
-# define GTEST_HAS_SEH 1
-# else
+#define GTEST_HAS_SEH 1
+#else
// Assume no SEH.
-# define GTEST_HAS_SEH 0
-# endif
+#define GTEST_HAS_SEH 0
+#endif
#endif // GTEST_HAS_SEH
@@ -758,94 +726,112 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
#endif // GTEST_IS_THREADSAFE
+#if GTEST_IS_THREADSAFE
+// Some platforms don't support including these threading related headers.
+#include <condition_variable> // NOLINT
+#include <mutex> // NOLINT
+#endif // GTEST_IS_THREADSAFE
+
// GTEST_API_ qualifies all symbols that must be exported. The definitions below
// are guarded by #ifndef to give embedders a chance to define GTEST_API_ in
// gtest/internal/custom/gtest-port.h
#ifndef GTEST_API_
#ifdef _MSC_VER
-# if GTEST_LINKED_AS_SHARED_LIBRARY
-# define GTEST_API_ __declspec(dllimport)
-# elif GTEST_CREATE_SHARED_LIBRARY
-# define GTEST_API_ __declspec(dllexport)
-# endif
+#if GTEST_LINKED_AS_SHARED_LIBRARY
+#define GTEST_API_ __declspec(dllimport)
+#elif GTEST_CREATE_SHARED_LIBRARY
+#define GTEST_API_ __declspec(dllexport)
+#endif
#elif __GNUC__ >= 4 || defined(__clang__)
-# define GTEST_API_ __attribute__((visibility ("default")))
+#define GTEST_API_ __attribute__((visibility("default")))
#endif // _MSC_VER
#endif // GTEST_API_
#ifndef GTEST_API_
-# define GTEST_API_
+#define GTEST_API_
#endif // GTEST_API_
#ifndef GTEST_DEFAULT_DEATH_TEST_STYLE
-# define GTEST_DEFAULT_DEATH_TEST_STYLE "fast"
+#define GTEST_DEFAULT_DEATH_TEST_STYLE "fast"
#endif // GTEST_DEFAULT_DEATH_TEST_STYLE
#ifdef __GNUC__
// Ask the compiler to never inline a given function.
-# define GTEST_NO_INLINE_ __attribute__((noinline))
+#define GTEST_NO_INLINE_ __attribute__((noinline))
#else
-# define GTEST_NO_INLINE_
+#define GTEST_NO_INLINE_
+#endif
+
+#if defined(__clang__)
+// Nested ifs to avoid triggering MSVC warning.
+#if __has_attribute(disable_tail_calls)
+// Ask the compiler not to perform tail call optimization inside
+// the marked function.
+#define GTEST_NO_TAIL_CALL_ __attribute__((disable_tail_calls))
+#endif
+#elif __GNUC__
+#define GTEST_NO_TAIL_CALL_ \
+ __attribute__((optimize("no-optimize-sibling-calls")))
+#else
+#define GTEST_NO_TAIL_CALL_
#endif
// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
#if !defined(GTEST_HAS_CXXABI_H_)
-# if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
-# define GTEST_HAS_CXXABI_H_ 1
-# else
-# define GTEST_HAS_CXXABI_H_ 0
-# endif
+#if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
+#define GTEST_HAS_CXXABI_H_ 1
+#else
+#define GTEST_HAS_CXXABI_H_ 0
+#endif
#endif
// A function level attribute to disable checking for use of uninitialized
// memory when built with MemorySanitizer.
#if defined(__clang__)
-# if __has_feature(memory_sanitizer)
-# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \
- __attribute__((no_sanitize_memory))
-# else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-# endif // __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ __attribute__((no_sanitize_memory))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#endif // __has_feature(memory_sanitizer)
#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
#endif // __clang__
// A function level attribute to disable AddressSanitizer instrumentation.
#if defined(__clang__)
-# if __has_feature(address_sanitizer)
-# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
- __attribute__((no_sanitize_address))
-# else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-# endif // __has_feature(address_sanitizer)
+#if __has_feature(address_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
+ __attribute__((no_sanitize_address))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#endif // __has_feature(address_sanitizer)
#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
#endif // __clang__
// A function level attribute to disable HWAddressSanitizer instrumentation.
#if defined(__clang__)
-# if __has_feature(hwaddress_sanitizer)
-# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \
- __attribute__((no_sanitize("hwaddress")))
-# else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-# endif // __has_feature(hwaddress_sanitizer)
+#if __has_feature(hwaddress_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \
+ __attribute__((no_sanitize("hwaddress")))
#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+#endif // __has_feature(hwaddress_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
#endif // __clang__
// A function level attribute to disable ThreadSanitizer instrumentation.
#if defined(__clang__)
-# if __has_feature(thread_sanitizer)
-# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \
- __attribute__((no_sanitize_thread))
-# else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-# endif // __has_feature(thread_sanitizer)
+#if __has_feature(thread_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ __attribute__((no_sanitize_thread))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#endif // __has_feature(thread_sanitizer)
#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
#endif // __clang__
namespace testing {
@@ -867,25 +853,37 @@ namespace internal {
// Secret object, which is what we want.
class Secret;
-// The GTEST_COMPILE_ASSERT_ is a legacy macro used to verify that a compile
-// time expression is true (in new code, use static_assert instead). For
-// example, you could use it to verify the size of a static array:
-//
-// GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES,
-// names_incorrect_size);
-//
-// The second argument to the macro must be a valid C++ identifier. If the
-// expression is false, compiler will issue an error containing this identifier.
-#define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg)
-
// A helper for suppressing warnings on constant condition. It just
// returns 'condition'.
GTEST_API_ bool IsTrue(bool condition);
// Defines RE.
-#if GTEST_USES_PCRE
-// if used, PCRE is injected by custom/gtest-port.h
+#if GTEST_USES_RE2
+
+// This is almost `using RE = ::RE2`, except it is copy-constructible, and it
+// needs to disambiguate the `std::string`, `absl::string_view`, and `const
+// char*` constructors.
+class GTEST_API_ RE {
+ public:
+ RE(absl::string_view regex) : regex_(regex) {} // NOLINT
+ RE(const char* regex) : RE(absl::string_view(regex)) {} // NOLINT
+ RE(const std::string& regex) : RE(absl::string_view(regex)) {} // NOLINT
+ RE(const RE& other) : RE(other.pattern()) {}
+
+ const std::string& pattern() const { return regex_.pattern(); }
+
+ static bool FullMatch(absl::string_view str, const RE& re) {
+ return RE2::FullMatch(str, re.regex_);
+ }
+ static bool PartialMatch(absl::string_view str, const RE& re) {
+ return RE2::PartialMatch(str, re.regex_);
+ }
+
+ private:
+ RE2 regex_;
+};
+
#elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE
// A simple C++ wrapper for <regex.h>. It uses the POSIX Extended
@@ -924,19 +922,19 @@ class GTEST_API_ RE {
const char* pattern_;
bool is_valid_;
-# if GTEST_USES_POSIX_RE
+#if GTEST_USES_POSIX_RE
regex_t full_regex_; // For FullMatch().
regex_t partial_regex_; // For PartialMatch().
-# else // GTEST_USES_SIMPLE_RE
+#else // GTEST_USES_SIMPLE_RE
const char* full_pattern_; // For FullMatch();
-# endif
+#endif
};
-#endif // GTEST_USES_PCRE
+#endif // ::testing::internal::RE implementation
// Formats a source file path and a line number as they would appear
// in an error message from the compiler used to compile this code.
@@ -954,12 +952,7 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
// LogToStderr() - directs all log messages to stderr.
// FlushInfoLog() - flushes informational log messages.
-enum GTestLogSeverity {
- GTEST_INFO,
- GTEST_WARNING,
- GTEST_ERROR,
- GTEST_FATAL
-};
+enum GTestLogSeverity { GTEST_INFO, GTEST_WARNING, GTEST_ERROR, GTEST_FATAL };
// Formats log entry severity, provides a stream object for streaming the
// log message, and terminates the message with a newline when going out of
@@ -976,14 +969,16 @@ class GTEST_API_ GTestLog {
private:
const GTestLogSeverity severity_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
+ GTestLog(const GTestLog&) = delete;
+ GTestLog& operator=(const GTestLog&) = delete;
};
#if !defined(GTEST_LOG_)
-# define GTEST_LOG_(severity) \
- ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
- __FILE__, __LINE__).GetStream()
+#define GTEST_LOG_(severity) \
+ ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+ __FILE__, __LINE__) \
+ .GetStream()
inline void LogToStderr() {}
inline void FlushInfoLog() { fflush(nullptr); }
@@ -995,7 +990,7 @@ inline void FlushInfoLog() { fflush(nullptr); }
//
// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
// is not satisfied.
-// Synopsys:
+// Synopsis:
// GTEST_CHECK_(boolean_condition);
// or
// GTEST_CHECK_(boolean_condition) << "Additional message";
@@ -1005,12 +1000,12 @@ inline void FlushInfoLog() { fflush(nullptr); }
// condition itself, plus additional message streamed into it, if any,
// and then it aborts the program. It aborts the program irrespective of
// whether it is built in the debug mode or not.
-# define GTEST_CHECK_(condition) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (::testing::internal::IsTrue(condition)) \
- ; \
- else \
- GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+#define GTEST_CHECK_(condition) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::IsTrue(condition)) \
+ ; \
+ else \
+ GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
#endif // !defined(GTEST_CHECK_)
// An all-mode assert to verify that the given POSIX-style function
@@ -1019,9 +1014,8 @@ inline void FlushInfoLog() { fflush(nullptr); }
// in {} if you need to use it as the only statement in an 'if'
// branch.
#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
- if (const int gtest_error = (posix_call)) \
- GTEST_LOG_(FATAL) << #posix_call << "failed with error " \
- << gtest_error
+ if (const int gtest_error = (posix_call)) \
+ GTEST_LOG_(FATAL) << #posix_call << "failed with error " << gtest_error
// Transforms "T" into "const T&" according to standard reference collapsing
// rules (this is only needed as a backport for C++98 compilers that do not
@@ -1035,9 +1029,13 @@ inline void FlushInfoLog() { fflush(nullptr); }
// Note that the non-const reference will not have "const" added. This is
// standard, and necessary so that "T" can always bind to "const T&".
template <typename T>
-struct ConstRef { typedef const T& type; };
+struct ConstRef {
+ typedef const T& type;
+};
template <typename T>
-struct ConstRef<T&> { typedef T& type; };
+struct ConstRef<T&> {
+ typedef T& type;
+};
// The argument T must depend on some template parameters.
#define GTEST_REFERENCE_TO_CONST_(T) \
@@ -1050,7 +1048,7 @@ struct ConstRef<T&> { typedef T& type; };
// const Foo*). When you use ImplicitCast_, the compiler checks that
// the cast is safe. Such explicit ImplicitCast_s are necessary in
// surprisingly many situations where C++ demands an exact type match
-// instead of an argument type convertable to a target type.
+// instead of an argument type convertible to a target type.
//
// The syntax for using ImplicitCast_ is the same as for static_cast:
//
@@ -1063,8 +1061,10 @@ struct ConstRef<T&> { typedef T& type; };
// This relatively ugly name is intentional. It prevents clashes with
// similar functions users may have (e.g., implicit_cast). The internal
// namespace alone is not enough because the function can be found by ADL.
-template<typename To>
-inline To ImplicitCast_(To x) { return x; }
+template <typename To>
+inline To ImplicitCast_(To x) {
+ return x;
+}
// When you upcast (that is, cast a pointer from type Foo to type
// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
@@ -1087,17 +1087,17 @@ inline To ImplicitCast_(To x) { return x; }
// This relatively ugly name is intentional. It prevents clashes with
// similar functions users may have (e.g., down_cast). The internal
// namespace alone is not enough because the function can be found by ADL.
-template<typename To, typename From> // use like this: DownCast_<T*>(foo);
-inline To DownCast_(From* f) { // so we only accept pointers
+template <typename To, typename From> // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From* f) { // so we only accept pointers
// Ensures that To is a sub-type of From *. This test is here only
// for compile-time type checking, and has no overhead in an
// optimized build at run-time, as it will be optimized away
// completely.
GTEST_INTENTIONAL_CONST_COND_PUSH_()
if (false) {
- GTEST_INTENTIONAL_CONST_COND_POP_()
- const To to = nullptr;
- ::testing::internal::ImplicitCast_<From*>(to);
+ GTEST_INTENTIONAL_CONST_COND_POP_()
+ const To to = nullptr;
+ ::testing::internal::ImplicitCast_<From*>(to);
}
#if GTEST_HAS_RTTI
@@ -1162,71 +1162,8 @@ void ClearInjectableArgvs();
// Defines synchronization primitives.
#if GTEST_IS_THREADSAFE
-# if GTEST_HAS_PTHREAD
-// Sleeps for (roughly) n milliseconds. This function is only for testing
-// Google Test's own constructs. Don't use it in user tests, either
-// directly or indirectly.
-inline void SleepMilliseconds(int n) {
- const timespec time = {
- 0, // 0 seconds.
- n * 1000L * 1000L, // And n ms.
- };
- nanosleep(&time, nullptr);
-}
-# endif // GTEST_HAS_PTHREAD
-
-# if GTEST_HAS_NOTIFICATION_
-// Notification has already been imported into the namespace.
-// Nothing to do here.
-
-# elif GTEST_HAS_PTHREAD
-// Allows a controller thread to pause execution of newly created
-// threads until notified. Instances of this class must be created
-// and destroyed in the controller thread.
-//
-// This class is only for testing Google Test's own constructs. Do not
-// use it in user tests, either directly or indirectly.
-class Notification {
- public:
- Notification() : notified_(false) {
- GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
- }
- ~Notification() {
- pthread_mutex_destroy(&mutex_);
- }
-
- // Notifies all threads created with this notification to start. Must
- // be called from the controller thread.
- void Notify() {
- pthread_mutex_lock(&mutex_);
- notified_ = true;
- pthread_mutex_unlock(&mutex_);
- }
-
- // Blocks until the controller thread notifies. Must be called from a test
- // thread.
- void WaitForNotification() {
- for (;;) {
- pthread_mutex_lock(&mutex_);
- const bool notified = notified_;
- pthread_mutex_unlock(&mutex_);
- if (notified)
- break;
- SleepMilliseconds(10);
- }
- }
-
- private:
- pthread_mutex_t mutex_;
- bool notified_;
-
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
-};
-
-# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
-
-GTEST_API_ void SleepMilliseconds(int n);
+#if GTEST_OS_WINDOWS
// Provides leak-safe Windows kernel handle ownership.
// Used in death tests and in threading support.
class GTEST_API_ AutoHandle {
@@ -1253,8 +1190,18 @@ class GTEST_API_ AutoHandle {
Handle handle_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
+ AutoHandle(const AutoHandle&) = delete;
+ AutoHandle& operator=(const AutoHandle&) = delete;
};
+#endif
+
+#if GTEST_HAS_NOTIFICATION_
+// Notification has already been imported into the namespace.
+// Nothing to do here.
+
+#else
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
// Allows a controller thread to pause execution of newly created
// threads until notified. Instances of this class must be created
@@ -1262,23 +1209,40 @@ class GTEST_API_ AutoHandle {
//
// This class is only for testing Google Test's own constructs. Do not
// use it in user tests, either directly or indirectly.
+// TODO(b/203539622): Replace unconditionally with absl::Notification.
class GTEST_API_ Notification {
public:
- Notification();
- void Notify();
- void WaitForNotification();
+ Notification() : notified_(false) {}
+ Notification(const Notification&) = delete;
+ Notification& operator=(const Notification&) = delete;
- private:
- AutoHandle event_;
+ // Notifies all threads created with this notification to start. Must
+ // be called from the controller thread.
+ void Notify() {
+ std::lock_guard<std::mutex> lock(mu_);
+ notified_ = true;
+ cv_.notify_all();
+ }
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+ // Blocks until the controller thread notifies. Must be called from a test
+ // thread.
+ void WaitForNotification() {
+ std::unique_lock<std::mutex> lock(mu_);
+ cv_.wait(lock, [this]() { return notified_; });
+ }
+
+ private:
+ std::mutex mu_;
+ std::condition_variable cv_;
+ bool notified_;
};
-# endif // GTEST_HAS_NOTIFICATION_
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+#endif // GTEST_HAS_NOTIFICATION_
// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD
// defined, but we don't want to use MinGW's pthreads implementation, which
// has conformance problems with some versions of the POSIX standard.
-# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
+#if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
// As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
// Consequently, it cannot select a correct instantiation of ThreadWithParam
@@ -1354,16 +1318,17 @@ class ThreadWithParam : public ThreadWithParamBase {
// finished.
pthread_t thread_; // The native thread object.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+ ThreadWithParam(const ThreadWithParam&) = delete;
+ ThreadWithParam& operator=(const ThreadWithParam&) = delete;
};
-# endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
- // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+#endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
+ // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
-# if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+#if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
// Mutex and ThreadLocal have already been imported into the namespace.
// Nothing to do here.
-# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
// Mutex implements mutex on Windows platforms. It is used in conjunction
// with class MutexLock:
@@ -1417,14 +1382,15 @@ class GTEST_API_ Mutex {
long critical_section_init_phase_; // NOLINT
GTEST_CRITICAL_SECTION* critical_section_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+ Mutex(const Mutex&) = delete;
+ Mutex& operator=(const Mutex&) = delete;
};
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
- extern ::testing::internal::Mutex mutex
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+ extern ::testing::internal::Mutex mutex
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
- ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+ ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
// We cannot name this class MutexLock because the ctor declaration would
// conflict with a macro named MutexLock, which is defined on some
@@ -1433,15 +1399,15 @@ class GTEST_API_ Mutex {
// "MutexLock l(&mu)". Hence the typedef trick below.
class GTestMutexLock {
public:
- explicit GTestMutexLock(Mutex* mutex)
- : mutex_(mutex) { mutex_->Lock(); }
+ explicit GTestMutexLock(Mutex* mutex) : mutex_(mutex) { mutex_->Lock(); }
~GTestMutexLock() { mutex_->Unlock(); }
private:
Mutex* const mutex_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+ GTestMutexLock(const GTestMutexLock&) = delete;
+ GTestMutexLock& operator=(const GTestMutexLock&) = delete;
};
typedef GTestMutexLock MutexLock;
@@ -1468,7 +1434,8 @@ class ThreadLocalBase {
virtual ~ThreadLocalBase() {}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase);
+ ThreadLocalBase(const ThreadLocalBase&) = delete;
+ ThreadLocalBase& operator=(const ThreadLocalBase&) = delete;
};
// Maps a thread to a set of ThreadLocals that have values instantiated on that
@@ -1497,7 +1464,7 @@ class GTEST_API_ ThreadWithParamBase {
virtual void Run() = 0;
};
- ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start);
+ ThreadWithParamBase(Runnable* runnable, Notification* thread_can_start);
virtual ~ThreadWithParamBase();
private:
@@ -1511,30 +1478,26 @@ class ThreadWithParam : public ThreadWithParamBase {
typedef void UserThreadFunc(T);
ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
- : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {
- }
+ : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {}
virtual ~ThreadWithParam() {}
private:
class RunnableImpl : public Runnable {
public:
- RunnableImpl(UserThreadFunc* func, T param)
- : func_(func),
- param_(param) {
- }
+ RunnableImpl(UserThreadFunc* func, T param) : func_(func), param_(param) {}
virtual ~RunnableImpl() {}
- virtual void Run() {
- func_(param_);
- }
+ virtual void Run() { func_(param_); }
private:
UserThreadFunc* const func_;
const T param_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl);
+ RunnableImpl(const RunnableImpl&) = delete;
+ RunnableImpl& operator=(const RunnableImpl&) = delete;
};
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+ ThreadWithParam(const ThreadWithParam&) = delete;
+ ThreadWithParam& operator=(const ThreadWithParam&) = delete;
};
// Implements thread-local storage on Windows systems.
@@ -1571,7 +1534,7 @@ class ThreadLocal : public ThreadLocalBase {
explicit ThreadLocal(const T& value)
: default_factory_(new InstanceValueHolderFactory(value)) {}
- ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
+ ~ThreadLocal() override { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
T* pointer() { return GetOrCreateValue(); }
const T* pointer() const { return GetOrCreateValue(); }
@@ -1590,16 +1553,17 @@ class ThreadLocal : public ThreadLocalBase {
private:
T value_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+ ValueHolder(const ValueHolder&) = delete;
+ ValueHolder& operator=(const ValueHolder&) = delete;
};
-
T* GetOrCreateValue() const {
return static_cast<ValueHolder*>(
- ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer();
+ ThreadLocalRegistry::GetValueOnCurrentThread(this))
+ ->pointer();
}
- virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const {
+ ThreadLocalValueHolderBase* NewValueForCurrentThread() const override {
return default_factory_->MakeNewHolder();
}
@@ -1610,7 +1574,8 @@ class ThreadLocal : public ThreadLocalBase {
virtual ValueHolder* MakeNewHolder() const = 0;
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
+ ValueHolderFactory(const ValueHolderFactory&) = delete;
+ ValueHolderFactory& operator=(const ValueHolderFactory&) = delete;
};
class DefaultValueHolderFactory : public ValueHolderFactory {
@@ -1619,7 +1584,9 @@ class ThreadLocal : public ThreadLocalBase {
ValueHolder* MakeNewHolder() const override { return new ValueHolder(); }
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
+ DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete;
+ DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) =
+ delete;
};
class InstanceValueHolderFactory : public ValueHolderFactory {
@@ -1632,15 +1599,18 @@ class ThreadLocal : public ThreadLocalBase {
private:
const T value_; // The value for each thread.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory);
+ InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete;
+ InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) =
+ delete;
};
std::unique_ptr<ValueHolderFactory> default_factory_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+ ThreadLocal(const ThreadLocal&) = delete;
+ ThreadLocal& operator=(const ThreadLocal&) = delete;
};
-# elif GTEST_HAS_PTHREAD
+#elif GTEST_HAS_PTHREAD
// MutexBase and Mutex implement mutex on pthreads-based platforms.
class MutexBase {
@@ -1687,8 +1657,8 @@ class MutexBase {
};
// Forward-declares a static mutex.
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
- extern ::testing::internal::MutexBase mutex
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+ extern ::testing::internal::MutexBase mutex
// Defines and statically (i.e. at link time) initializes a static mutex.
// The initialization list here does not explicitly initialize each field,
@@ -1707,12 +1677,11 @@ class Mutex : public MutexBase {
GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
has_owner_ = false;
}
- ~Mutex() {
- GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_));
- }
+ ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); }
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+ Mutex(const Mutex&) = delete;
+ Mutex& operator=(const Mutex&) = delete;
};
// We cannot name this class MutexLock because the ctor declaration would
@@ -1722,15 +1691,15 @@ class Mutex : public MutexBase {
// "MutexLock l(&mu)". Hence the typedef trick below.
class GTestMutexLock {
public:
- explicit GTestMutexLock(MutexBase* mutex)
- : mutex_(mutex) { mutex_->Lock(); }
+ explicit GTestMutexLock(MutexBase* mutex) : mutex_(mutex) { mutex_->Lock(); }
~GTestMutexLock() { mutex_->Unlock(); }
private:
MutexBase* const mutex_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+ GTestMutexLock(const GTestMutexLock&) = delete;
+ GTestMutexLock& operator=(const GTestMutexLock&) = delete;
};
typedef GTestMutexLock MutexLock;
@@ -1787,7 +1756,8 @@ class GTEST_API_ ThreadLocal {
private:
T value_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+ ValueHolder(const ValueHolder&) = delete;
+ ValueHolder& operator=(const ValueHolder&) = delete;
};
static pthread_key_t CreateKey() {
@@ -1819,7 +1789,8 @@ class GTEST_API_ ThreadLocal {
virtual ValueHolder* MakeNewHolder() const = 0;
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
+ ValueHolderFactory(const ValueHolderFactory&) = delete;
+ ValueHolderFactory& operator=(const ValueHolderFactory&) = delete;
};
class DefaultValueHolderFactory : public ValueHolderFactory {
@@ -1828,7 +1799,9 @@ class GTEST_API_ ThreadLocal {
ValueHolder* MakeNewHolder() const override { return new ValueHolder(); }
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
+ DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete;
+ DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) =
+ delete;
};
class InstanceValueHolderFactory : public ValueHolderFactory {
@@ -1841,17 +1814,20 @@ class GTEST_API_ ThreadLocal {
private:
const T value_; // The value for each thread.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory);
+ InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete;
+ InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) =
+ delete;
};
// A key pthreads uses for looking up per-thread values.
const pthread_key_t key_;
std::unique_ptr<ValueHolderFactory> default_factory_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+ ThreadLocal(const ThreadLocal&) = delete;
+ ThreadLocal& operator=(const ThreadLocal&) = delete;
};
-# endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+#endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
#else // GTEST_IS_THREADSAFE
@@ -1868,10 +1844,10 @@ class Mutex {
void AssertHeld() const {}
};
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
extern ::testing::internal::Mutex mutex
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
// We cannot name this class MutexLock because the ctor declaration would
// conflict with a macro named MutexLock, which is defined on some
@@ -1894,6 +1870,7 @@ class GTEST_API_ ThreadLocal {
const T* pointer() const { return &value_; }
const T& get() const { return value_; }
void set(const T& value) { value_ = value; }
+
private:
T value_;
};
@@ -1905,11 +1882,11 @@ class GTEST_API_ ThreadLocal {
GTEST_API_ size_t GetThreadCount();
#if GTEST_OS_WINDOWS
-# define GTEST_PATH_SEP_ "\\"
-# define GTEST_HAS_ALT_PATH_SEP_ 1
+#define GTEST_PATH_SEP_ "\\"
+#define GTEST_HAS_ALT_PATH_SEP_ 1
#else
-# define GTEST_PATH_SEP_ "/"
-# define GTEST_HAS_ALT_PATH_SEP_ 0
+#define GTEST_PATH_SEP_ "/"
+#define GTEST_HAS_ALT_PATH_SEP_ 0
#endif // GTEST_OS_WINDOWS
// Utilities for char.
@@ -1967,8 +1944,7 @@ inline char ToUpper(char ch) {
inline std::string StripTrailingSpaces(std::string str) {
std::string::iterator it = str.end();
- while (it != str.begin() && IsSpace(*--it))
- it = str.erase(it);
+ while (it != str.begin() && IsSpace(*--it)) it = str.erase(it);
return str;
}
@@ -1986,36 +1962,35 @@ namespace posix {
typedef struct _stat StatStruct;
-# ifdef __BORLANDC__
+#ifdef __BORLANDC__
inline int DoIsATTY(int fd) { return isatty(fd); }
inline int StrCaseCmp(const char* s1, const char* s2) {
return stricmp(s1, s2);
}
inline char* StrDup(const char* src) { return strdup(src); }
-# else // !__BORLANDC__
-# if GTEST_OS_WINDOWS_MOBILE
+#else // !__BORLANDC__
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \
+ GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM)
inline int DoIsATTY(int /* fd */) { return 0; }
-# else
+#else
inline int DoIsATTY(int fd) { return _isatty(fd); }
-# endif // GTEST_OS_WINDOWS_MOBILE
+#endif // GTEST_OS_WINDOWS_MOBILE
inline int StrCaseCmp(const char* s1, const char* s2) {
return _stricmp(s1, s2);
}
inline char* StrDup(const char* src) { return _strdup(src); }
-# endif // __BORLANDC__
+#endif // __BORLANDC__
-# if GTEST_OS_WINDOWS_MOBILE
+#if GTEST_OS_WINDOWS_MOBILE
inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
// time and thus not defined there.
-# else
+#else
inline int FileNo(FILE* file) { return _fileno(file); }
inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
inline int RmDir(const char* dir) { return _rmdir(dir); }
-inline bool IsDir(const StatStruct& st) {
- return (_S_IFDIR & st.st_mode) != 0;
-}
-# endif // GTEST_OS_WINDOWS_MOBILE
+inline bool IsDir(const StatStruct& st) { return (_S_IFDIR & st.st_mode) != 0; }
+#endif // GTEST_OS_WINDOWS_MOBILE
#elif GTEST_OS_ESP8266
typedef struct stat StatStruct;
@@ -2079,12 +2054,12 @@ inline FILE* FOpen(const char* path, const char* mode) {
std::wstring wide_path = converter.from_bytes(path);
std::wstring wide_mode = converter.from_bytes(mode);
return _wfopen(wide_path.c_str(), wide_mode.c_str());
-#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
return fopen(path, mode);
#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
}
#if !GTEST_OS_WINDOWS_MOBILE
-inline FILE *FReopen(const char* path, const char* mode, FILE* stream) {
+inline FILE* FReopen(const char* path, const char* mode, FILE* stream) {
return freopen(path, mode, stream);
}
inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); }
@@ -2136,13 +2111,13 @@ GTEST_DISABLE_MSC_DEPRECATED_POP_()
// snprintf is a variadic function.
#if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE
// MSVC 2005 and above support variadic macros.
-# define GTEST_SNPRINTF_(buffer, size, format, ...) \
- _snprintf_s(buffer, size, size, format, __VA_ARGS__)
+#define GTEST_SNPRINTF_(buffer, size, format, ...) \
+ _snprintf_s(buffer, size, size, format, __VA_ARGS__)
#elif defined(_MSC_VER)
// Windows CE does not define _snprintf_s
-# define GTEST_SNPRINTF_ _snprintf
+#define GTEST_SNPRINTF_ _snprintf
#else
-# define GTEST_SNPRINTF_ snprintf
+#define GTEST_SNPRINTF_ snprintf
#endif
// The biggest signed integer type the compiler supports.
@@ -2202,37 +2177,84 @@ using TimeInMillis = int64_t; // Represents time in milliseconds.
// Macro for referencing flags.
#if !defined(GTEST_FLAG)
-# define GTEST_FLAG(name) FLAGS_gtest_##name
+#define GTEST_FLAG_NAME_(name) gtest_##name
+#define GTEST_FLAG(name) FLAGS_gtest_##name
#endif // !defined(GTEST_FLAG)
-#if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
-# define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
-#endif // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
+// Pick a command line flags implementation.
+#if GTEST_HAS_ABSL
-#if !defined(GTEST_DECLARE_bool_)
-# define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
+// Macros for defining flags.
+#define GTEST_DEFINE_bool_(name, default_val, doc) \
+ ABSL_FLAG(bool, GTEST_FLAG_NAME_(name), default_val, doc)
+#define GTEST_DEFINE_int32_(name, default_val, doc) \
+ ABSL_FLAG(int32_t, GTEST_FLAG_NAME_(name), default_val, doc)
+#define GTEST_DEFINE_string_(name, default_val, doc) \
+ ABSL_FLAG(std::string, GTEST_FLAG_NAME_(name), default_val, doc)
// Macros for declaring flags.
-# define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
-# define GTEST_DECLARE_int32_(name) \
- GTEST_API_ extern std::int32_t GTEST_FLAG(name)
-# define GTEST_DECLARE_string_(name) \
- GTEST_API_ extern ::std::string GTEST_FLAG(name)
+#define GTEST_DECLARE_bool_(name) \
+ ABSL_DECLARE_FLAG(bool, GTEST_FLAG_NAME_(name))
+#define GTEST_DECLARE_int32_(name) \
+ ABSL_DECLARE_FLAG(int32_t, GTEST_FLAG_NAME_(name))
+#define GTEST_DECLARE_string_(name) \
+ ABSL_DECLARE_FLAG(std::string, GTEST_FLAG_NAME_(name))
+
+#define GTEST_FLAG_SAVER_ ::absl::FlagSaver
+
+#define GTEST_FLAG_GET(name) ::absl::GetFlag(GTEST_FLAG(name))
+#define GTEST_FLAG_SET(name, value) \
+ (void)(::absl::SetFlag(&GTEST_FLAG(name), value))
+#define GTEST_USE_OWN_FLAGFILE_FLAG_ 0
+
+#else // GTEST_HAS_ABSL
// Macros for defining flags.
-# define GTEST_DEFINE_bool_(name, default_val, doc) \
- GTEST_API_ bool GTEST_FLAG(name) = (default_val)
-# define GTEST_DEFINE_int32_(name, default_val, doc) \
- GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val)
-# define GTEST_DEFINE_string_(name, default_val, doc) \
- GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
+#define GTEST_DEFINE_bool_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ bool GTEST_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DEFINE_int32_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DEFINE_string_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
-#endif // !defined(GTEST_DECLARE_bool_)
+// Macros for declaring flags.
+#define GTEST_DECLARE_bool_(name) \
+ namespace testing { \
+ GTEST_API_ extern bool GTEST_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DECLARE_int32_(name) \
+ namespace testing { \
+ GTEST_API_ extern std::int32_t GTEST_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DECLARE_string_(name) \
+ namespace testing { \
+ GTEST_API_ extern ::std::string GTEST_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+
+#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
+
+#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
+#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
+#define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
+
+#endif // GTEST_HAS_ABSL
// Thread annotations
#if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
-# define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
-# define GTEST_LOCK_EXCLUDED_(locks)
+#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+#define GTEST_LOCK_EXCLUDED_(locks)
#endif // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
// Parses 'str' for a 32-bit signed integer. If successful, writes the result
@@ -2308,6 +2330,7 @@ namespace testing {
namespace internal {
template <typename T>
using Optional = ::absl::optional<T>;
+inline ::absl::nullopt_t Nullopt() { return ::absl::nullopt; }
} // namespace internal
} // namespace testing
#else
@@ -2321,6 +2344,7 @@ namespace testing {
namespace internal {
template <typename T>
using Optional = ::std::optional<T>;
+inline ::std::nullopt_t Nullopt() { return ::std::nullopt; }
} // namespace internal
} // namespace testing
// The case where absl is configured NOT to alias std::optional is not
@@ -2332,7 +2356,7 @@ using Optional = ::std::optional<T>;
#if GTEST_HAS_ABSL
// Always use absl::string_view for Matcher<> specializations if googletest
// is built with absl support.
-# define GTEST_INTERNAL_HAS_STRING_VIEW 1
+#define GTEST_INTERNAL_HAS_STRING_VIEW 1
#include "absl/strings/string_view.h"
namespace testing {
namespace internal {
@@ -2340,11 +2364,11 @@ using StringView = ::absl::string_view;
} // namespace internal
} // namespace testing
#else
-# ifdef __has_include
-# if __has_include(<string_view>) && __cplusplus >= 201703L
+#ifdef __has_include
+#if __has_include(<string_view>) && __cplusplus >= 201703L
// Otherwise for C++17 and higher use std::string_view for Matcher<>
// specializations.
-# define GTEST_INTERNAL_HAS_STRING_VIEW 1
+#define GTEST_INTERNAL_HAS_STRING_VIEW 1
#include <string_view>
namespace testing {
namespace internal {
@@ -2353,8 +2377,8 @@ using StringView = ::std::string_view;
} // namespace testing
// The case where absl is configured NOT to alias std::string_view is not
// supported.
-# endif // __has_include(<string_view>) && __cplusplus >= 201703L
-# endif // __has_include
+#endif // __has_include(<string_view>) && __cplusplus >= 201703L
+#endif // __has_include
#endif // GTEST_HAS_ABSL
#if GTEST_HAS_ABSL
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-string.h b/third_party/googletest/src/include/gtest/internal/gtest-string.h
index 10f774f96..cca2e1f2a 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-string.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-string.h
@@ -26,7 +26,7 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file declares the String class and functions used internally by
@@ -36,17 +36,20 @@
// This header file is #included by gtest-internal.h.
// It should not be #included by other files.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
#ifdef __BORLANDC__
// string.h is not guaranteed to provide strcpy on C++ Builder.
-# include <mem.h>
+#include <mem.h>
#endif
#include <string.h>
+
#include <cstdint>
#include <string>
@@ -123,8 +126,7 @@ class GTEST_API_ String {
// Unlike strcasecmp(), this function can handle NULL argument(s).
// A NULL C string is considered different to any non-NULL C string,
// including the empty string.
- static bool CaseInsensitiveCStringEquals(const char* lhs,
- const char* rhs);
+ static bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs);
// Compares two wide C strings, ignoring case. Returns true if and only if
// they have the same content.
@@ -143,8 +145,8 @@ class GTEST_API_ String {
// Returns true if and only if the given string ends with the given suffix,
// ignoring case. Any string is considered to end with an empty suffix.
- static bool EndsWithCaseInsensitive(
- const std::string& str, const std::string& suffix);
+ static bool EndsWithCaseInsensitive(const std::string& str,
+ const std::string& suffix);
// Formats an int value as "%02d".
static std::string FormatIntWidth2(int value); // "%02d" for width == 2
@@ -163,7 +165,7 @@ class GTEST_API_ String {
private:
String(); // Not meant to be instantiated.
-}; // class String
+}; // class String
// Gets the content of the stringstream's buffer as an std::string. Each '\0'
// character in the buffer is replaced with "\\0".
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
index b87a2e2ca..6bc02a7de 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
@@ -30,7 +30,9 @@
// Type utilities needed for implementing typed and type-parameterized
// tests.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
@@ -39,11 +41,11 @@
// #ifdef __GNUC__ is too general here. It is possible to use gcc without using
// libstdc++ (which is where cxxabi.h comes from).
-# if GTEST_HAS_CXXABI_H_
-# include <cxxabi.h>
-# elif defined(__HP_aCC)
-# include <acxx_demangle.h>
-# endif // GTEST_HASH_CXXABI_H_
+#if GTEST_HAS_CXXABI_H_
+#include <cxxabi.h>
+#elif defined(__HP_aCC)
+#include <acxx_demangle.h>
+#endif // GTEST_HASH_CXXABI_H_
namespace testing {
namespace internal {
@@ -101,7 +103,9 @@ std::string GetTypeName() {
// A unique type indicating an empty node
struct None {};
-# define GTEST_TEMPLATE_ template <typename T> class
+#define GTEST_TEMPLATE_ \
+ template <typename T> \
+ class
// The template "selector" struct TemplateSel<Tmpl> is used to
// represent Tmpl, which must be a class template with one type
@@ -119,8 +123,7 @@ struct TemplateSel {
};
};
-# define GTEST_BIND_(TmplSel, T) \
- TmplSel::template Bind<T>::type
+#define GTEST_BIND_(TmplSel, T) TmplSel::template Bind<T>::type
template <GTEST_TEMPLATE_ Head_, GTEST_TEMPLATE_... Tail_>
struct Templates {
diff --git a/third_party/googletest/src/src/gtest-all.cc b/third_party/googletest/src/src/gtest-all.cc
index ad292905c..2a70ed88c 100644
--- a/third_party/googletest/src/src/gtest-all.cc
+++ b/third_party/googletest/src/src/gtest-all.cc
@@ -38,7 +38,7 @@
#include "gtest/gtest.h"
// The following lines pull in the real gtest *.cc files.
-#include "src/gtest.cc"
+#include "src/gtest-assertion-result.cc"
#include "src/gtest-death-test.cc"
#include "src/gtest-filepath.cc"
#include "src/gtest-matchers.cc"
@@ -46,3 +46,4 @@
#include "src/gtest-printers.cc"
#include "src/gtest-test-part.cc"
#include "src/gtest-typed-test.cc"
+#include "src/gtest.cc"
diff --git a/third_party/googletest/src/src/gtest-assertion-result.cc b/third_party/googletest/src/src/gtest-assertion-result.cc
new file mode 100644
index 000000000..f1c0b10dc
--- /dev/null
+++ b/third_party/googletest/src/src/gtest-assertion-result.cc
@@ -0,0 +1,77 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file defines the AssertionResult type.
+
+#include "gtest/gtest-assertion-result.h"
+
+#include <string>
+#include <utility>
+
+#include "gtest/gtest-message.h"
+
+namespace testing {
+
+// AssertionResult constructors.
+// Used in EXPECT_TRUE/FALSE(assertion_result).
+AssertionResult::AssertionResult(const AssertionResult& other)
+ : success_(other.success_),
+ message_(other.message_.get() != nullptr
+ ? new ::std::string(*other.message_)
+ : static_cast< ::std::string*>(nullptr)) {}
+
+// Swaps two AssertionResults.
+void AssertionResult::swap(AssertionResult& other) {
+ using std::swap;
+ swap(success_, other.success_);
+ swap(message_, other.message_);
+}
+
+// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+AssertionResult AssertionResult::operator!() const {
+ AssertionResult negation(!success_);
+ if (message_.get() != nullptr) negation << *message_;
+ return negation;
+}
+
+// Makes a successful assertion result.
+AssertionResult AssertionSuccess() { return AssertionResult(true); }
+
+// Makes a failed assertion result.
+AssertionResult AssertionFailure() { return AssertionResult(false); }
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << message.
+AssertionResult AssertionFailure(const Message& message) {
+ return AssertionFailure() << message;
+}
+
+} // namespace testing
diff --git a/third_party/googletest/src/src/gtest-death-test.cc b/third_party/googletest/src/src/gtest-death-test.cc
index bf4f6331d..e6abc6278 100644
--- a/third_party/googletest/src/src/gtest-death-test.cc
+++ b/third_party/googletest/src/src/gtest-death-test.cc
@@ -35,49 +35,49 @@
#include <functional>
#include <utility>
-#include "gtest/internal/gtest-port.h"
#include "gtest/internal/custom/gtest.h"
+#include "gtest/internal/gtest-port.h"
#if GTEST_HAS_DEATH_TEST
-# if GTEST_OS_MAC
-# include <crt_externs.h>
-# endif // GTEST_OS_MAC
-
-# include <errno.h>
-# include <fcntl.h>
-# include <limits.h>
-
-# if GTEST_OS_LINUX
-# include <signal.h>
-# endif // GTEST_OS_LINUX
-
-# include <stdarg.h>
-
-# if GTEST_OS_WINDOWS
-# include <windows.h>
-# else
-# include <sys/mman.h>
-# include <sys/wait.h>
-# endif // GTEST_OS_WINDOWS
-
-# if GTEST_OS_QNX
-# include <spawn.h>
-# endif // GTEST_OS_QNX
-
-# if GTEST_OS_FUCHSIA
-# include <lib/fdio/fd.h>
-# include <lib/fdio/io.h>
-# include <lib/fdio/spawn.h>
-# include <lib/zx/channel.h>
-# include <lib/zx/port.h>
-# include <lib/zx/process.h>
-# include <lib/zx/socket.h>
-# include <zircon/processargs.h>
-# include <zircon/syscalls.h>
-# include <zircon/syscalls/policy.h>
-# include <zircon/syscalls/port.h>
-# endif // GTEST_OS_FUCHSIA
+#if GTEST_OS_MAC
+#include <crt_externs.h>
+#endif // GTEST_OS_MAC
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+
+#if GTEST_OS_LINUX
+#include <signal.h>
+#endif // GTEST_OS_LINUX
+
+#include <stdarg.h>
+
+#if GTEST_OS_WINDOWS
+#include <windows.h>
+#else
+#include <sys/mman.h>
+#include <sys/wait.h>
+#endif // GTEST_OS_WINDOWS
+
+#if GTEST_OS_QNX
+#include <spawn.h>
+#endif // GTEST_OS_QNX
+
+#if GTEST_OS_FUCHSIA
+#include <lib/fdio/fd.h>
+#include <lib/fdio/io.h>
+#include <lib/fdio/spawn.h>
+#include <lib/zx/channel.h>
+#include <lib/zx/port.h>
+#include <lib/zx/process.h>
+#include <lib/zx/socket.h>
+#include <zircon/processargs.h>
+#include <zircon/syscalls.h>
+#include <zircon/syscalls/policy.h>
+#include <zircon/syscalls/port.h>
+#endif // GTEST_OS_FUCHSIA
#endif // GTEST_HAS_DEATH_TEST
@@ -96,9 +96,12 @@ namespace testing {
// used internally at Google, is "threadsafe".
static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE;
+} // namespace testing
+
GTEST_DEFINE_string_(
death_test_style,
- internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
+ testing::internal::StringFromGTestEnv("death_test_style",
+ testing::kDefaultDeathTestStyle),
"Indicates how to run a death test in a forked child process: "
"\"threadsafe\" (child process re-executes the test binary "
"from the beginning, running only the specific death test) or "
@@ -107,7 +110,7 @@ GTEST_DEFINE_string_(
GTEST_DEFINE_bool_(
death_test_use_fork,
- internal::BoolFromGTestEnv("death_test_use_fork", false),
+ testing::internal::BoolFromGTestEnv("death_test_use_fork", false),
"Instructs to use fork()/_exit() instead of clone() in death tests. "
"Ignored and always uses fork() on POSIX systems where clone() is not "
"implemented. Useful when running under valgrind or similar tools if "
@@ -117,7 +120,6 @@ GTEST_DEFINE_bool_(
"work in 99% of the cases. Once valgrind is fixed, this flag will "
"most likely be removed.");
-namespace internal {
GTEST_DEFINE_string_(
internal_run_death_test, "",
"Indicates the file, line number, temporal index of "
@@ -126,7 +128,8 @@ GTEST_DEFINE_string_(
"the '|' characters. This flag is specified if and only if the "
"current process is a sub-process launched for running a thread-safe "
"death test. FOR INTERNAL USE ONLY.");
-} // namespace internal
+
+namespace testing {
#if GTEST_HAS_DEATH_TEST
@@ -134,9 +137,9 @@ namespace internal {
// Valid only for fast death tests. Indicates the code is running in the
// child process of a fast style death test.
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
static bool g_in_fast_death_test_child = false;
-# endif
+#endif
// Returns a Boolean value indicating whether the caller is currently
// executing in the context of the death test child process. Tools such as
@@ -144,16 +147,16 @@ static bool g_in_fast_death_test_child = false;
// tests. IMPORTANT: This is an internal utility. Using it may break the
// implementation of death tests. User code MUST NOT use it.
bool InDeathTestChild() {
-# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
// On Windows and Fuchsia, death tests are thread-safe regardless of the value
// of the death_test_style flag.
- return !GTEST_FLAG(internal_run_death_test).empty();
+ return !GTEST_FLAG_GET(internal_run_death_test).empty();
-# else
+#else
- if (GTEST_FLAG(death_test_style) == "threadsafe")
- return !GTEST_FLAG(internal_run_death_test).empty();
+ if (GTEST_FLAG_GET(death_test_style) == "threadsafe")
+ return !GTEST_FLAG_GET(internal_run_death_test).empty();
else
return g_in_fast_death_test_child;
#endif
@@ -162,40 +165,38 @@ bool InDeathTestChild() {
} // namespace internal
// ExitedWithCode constructor.
-ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {
-}
+ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {}
// ExitedWithCode function-call operator.
bool ExitedWithCode::operator()(int exit_status) const {
-# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
return exit_status == exit_code_;
-# else
+#else
return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_;
-# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
}
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
// KilledBySignal constructor.
-KilledBySignal::KilledBySignal(int signum) : signum_(signum) {
-}
+KilledBySignal::KilledBySignal(int signum) : signum_(signum) {}
// KilledBySignal function-call operator.
bool KilledBySignal::operator()(int exit_status) const {
-# if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+#if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
{
bool result;
if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) {
return result;
}
}
-# endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+#endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_;
}
-# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
namespace internal {
@@ -206,23 +207,23 @@ namespace internal {
static std::string ExitSummary(int exit_code) {
Message m;
-# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
m << "Exited with exit status " << exit_code;
-# else
+#else
if (WIFEXITED(exit_code)) {
m << "Exited with exit status " << WEXITSTATUS(exit_code);
} else if (WIFSIGNALED(exit_code)) {
m << "Terminated by signal " << WTERMSIG(exit_code);
}
-# ifdef WCOREDUMP
+#ifdef WCOREDUMP
if (WCOREDUMP(exit_code)) {
m << " (core dumped)";
}
-# endif
-# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#endif
+#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
return m.GetString();
}
@@ -233,7 +234,7 @@ bool ExitedUnsuccessfully(int exit_status) {
return !ExitedWithCode(0)(exit_status);
}
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
// Generates a textual failure message when a death test finds more than
// one thread running, or cannot determine the number of threads, prior
// to executing the given statement. It is the responsibility of the
@@ -254,7 +255,7 @@ static std::string DeathTestThreadWarning(size_t thread_count) {
<< " this is the last message you see before your test times out.";
return msg.GetString();
}
-# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
// Flag characters for reporting a death test that did not die.
static const char kDeathTestLived = 'L';
@@ -304,14 +305,14 @@ static void DeathTestAbort(const std::string& message) {
// A replacement for CHECK that calls DeathTestAbort if the assertion
// fails.
-# define GTEST_DEATH_TEST_CHECK_(expression) \
- do { \
- if (!::testing::internal::IsTrue(expression)) { \
- DeathTestAbort( \
- ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
- + ::testing::internal::StreamableToString(__LINE__) + ": " \
- + #expression); \
- } \
+#define GTEST_DEATH_TEST_CHECK_(expression) \
+ do { \
+ if (!::testing::internal::IsTrue(expression)) { \
+ DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \
+ ", line " + \
+ ::testing::internal::StreamableToString(__LINE__) + \
+ ": " + #expression); \
+ } \
} while (::testing::internal::AlwaysFalse())
// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
@@ -321,23 +322,23 @@ static void DeathTestAbort(const std::string& message) {
// evaluates the expression as long as it evaluates to -1 and sets
// errno to EINTR. If the expression evaluates to -1 but errno is
// something other than EINTR, DeathTestAbort is called.
-# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
- do { \
- int gtest_retval; \
- do { \
- gtest_retval = (expression); \
- } while (gtest_retval == -1 && errno == EINTR); \
- if (gtest_retval == -1) { \
- DeathTestAbort( \
- ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
- + ::testing::internal::StreamableToString(__LINE__) + ": " \
- + #expression + " != -1"); \
- } \
+#define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
+ do { \
+ int gtest_retval; \
+ do { \
+ gtest_retval = (expression); \
+ } while (gtest_retval == -1 && errno == EINTR); \
+ if (gtest_retval == -1) { \
+ DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \
+ ", line " + \
+ ::testing::internal::StreamableToString(__LINE__) + \
+ ": " + #expression + " != -1"); \
+ } \
} while (::testing::internal::AlwaysFalse())
// Returns the message describing the last system error in errno.
std::string GetLastErrnoDescription() {
- return errno == 0 ? "" : posix::StrError(errno);
+ return errno == 0 ? "" : posix::StrError(errno);
}
// This is called from a death test parent process to read a failure
@@ -370,8 +371,9 @@ static void FailFromInternalError(int fd) {
DeathTest::DeathTest() {
TestInfo* const info = GetUnitTestImpl()->current_test_info();
if (info == nullptr) {
- DeathTestAbort("Cannot run a death test outside of a TEST or "
- "TEST_F construct");
+ DeathTestAbort(
+ "Cannot run a death test outside of a TEST or "
+ "TEST_F construct");
}
}
@@ -500,9 +502,7 @@ void DeathTestImpl::ReadAndInterpretStatusByte() {
set_read_fd(-1);
}
-std::string DeathTestImpl::GetErrorLogs() {
- return GetCapturedStderr();
-}
+std::string DeathTestImpl::GetErrorLogs() { return GetCapturedStderr(); }
// Signals that the death test code which should have exited, didn't.
// Should be called only in a death test child process.
@@ -512,9 +512,9 @@ void DeathTestImpl::Abort(AbortReason reason) {
// The parent process considers the death test to be a failure if
// it finds any data in our pipe. So, here we write a single flag byte
// to the pipe, then exit.
- const char status_ch =
- reason == TEST_DID_NOT_DIE ? kDeathTestLived :
- reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned;
+ const char status_ch = reason == TEST_DID_NOT_DIE ? kDeathTestLived
+ : reason == TEST_THREW_EXCEPTION ? kDeathTestThrew
+ : kDeathTestReturned;
GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
// We are leaking the descriptor here because on some platforms (i.e.,
@@ -533,7 +533,7 @@ void DeathTestImpl::Abort(AbortReason reason) {
// much easier.
static ::std::string FormatDeathTestOutput(const ::std::string& output) {
::std::string ret;
- for (size_t at = 0; ; ) {
+ for (size_t at = 0;;) {
const size_t line_end = output.find('\n', at);
ret += "[ DEATH ] ";
if (line_end == ::std::string::npos) {
@@ -568,8 +568,7 @@ static ::std::string FormatDeathTestOutput(const ::std::string& output) {
// the first failing condition, in the order given above, is the one that is
// reported. Also sets the last death test message string.
bool DeathTestImpl::Passed(bool status_ok) {
- if (!spawned())
- return false;
+ if (!spawned()) return false;
const std::string error_message = GetErrorLogs();
@@ -580,15 +579,18 @@ bool DeathTestImpl::Passed(bool status_ok) {
switch (outcome()) {
case LIVED:
buffer << " Result: failed to die.\n"
- << " Error msg:\n" << FormatDeathTestOutput(error_message);
+ << " Error msg:\n"
+ << FormatDeathTestOutput(error_message);
break;
case THREW:
buffer << " Result: threw an exception.\n"
- << " Error msg:\n" << FormatDeathTestOutput(error_message);
+ << " Error msg:\n"
+ << FormatDeathTestOutput(error_message);
break;
case RETURNED:
buffer << " Result: illegal return in test statement.\n"
- << " Error msg:\n" << FormatDeathTestOutput(error_message);
+ << " Error msg:\n"
+ << FormatDeathTestOutput(error_message);
break;
case DIED:
if (status_ok) {
@@ -605,7 +607,8 @@ bool DeathTestImpl::Passed(bool status_ok) {
} else {
buffer << " Result: died but not with expected exit code:\n"
<< " " << ExitSummary(status()) << "\n"
- << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+ << "Actual msg:\n"
+ << FormatDeathTestOutput(error_message);
}
break;
case IN_PROGRESS:
@@ -618,7 +621,7 @@ bool DeathTestImpl::Passed(bool status_ok) {
return success;
}
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
// WindowsDeathTest implements death tests on Windows. Due to the
// specifics of starting new processes on Windows, death tests there are
// always threadsafe, and Google Test considers the
@@ -679,14 +682,12 @@ class WindowsDeathTest : public DeathTestImpl {
// status, or 0 if no child process exists. As a side effect, sets the
// outcome data member.
int WindowsDeathTest::Wait() {
- if (!spawned())
- return 0;
+ if (!spawned()) return 0;
// Wait until the child either signals that it has acquired the write end
// of the pipe or it dies.
- const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
- switch (::WaitForMultipleObjects(2,
- wait_handles,
+ const HANDLE wait_handles[2] = {child_handle_.Get(), event_handle_.Get()};
+ switch (::WaitForMultipleObjects(2, wait_handles,
FALSE, // Waits for any of the handles.
INFINITE)) {
case WAIT_OBJECT_0:
@@ -707,9 +708,8 @@ int WindowsDeathTest::Wait() {
// returns immediately if the child has already exited, regardless of
// whether previous calls to WaitForMultipleObjects synchronized on this
// handle or not.
- GTEST_DEATH_TEST_CHECK_(
- WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
- INFINITE));
+ GTEST_DEATH_TEST_CHECK_(WAIT_OBJECT_0 ==
+ ::WaitForSingleObject(child_handle_.Get(), INFINITE));
DWORD status_code;
GTEST_DEATH_TEST_CHECK_(
::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
@@ -742,12 +742,12 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
SECURITY_ATTRIBUTES handles_are_inheritable = {sizeof(SECURITY_ATTRIBUTES),
nullptr, TRUE};
HANDLE read_handle, write_handle;
- GTEST_DEATH_TEST_CHECK_(
- ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
- 0) // Default buffer size.
- != FALSE);
- set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
- O_RDONLY));
+ GTEST_DEATH_TEST_CHECK_(::CreatePipe(&read_handle, &write_handle,
+ &handles_are_inheritable,
+ 0) // Default buffer size.
+ != FALSE);
+ set_read_fd(
+ ::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle), O_RDONLY));
write_handle_.Reset(write_handle);
event_handle_.Reset(::CreateEvent(
&handles_are_inheritable,
@@ -756,27 +756,26 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
nullptr)); // The even is unnamed.
GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr);
const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
- kFilterFlag + "=" + info->test_suite_name() +
- "." + info->name();
+ "filter=" + info->test_suite_name() + "." +
+ info->name();
const std::string internal_flag =
- std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag +
- "=" + file_ + "|" + StreamableToString(line_) + "|" +
- StreamableToString(death_test_index) + "|" +
+ std::string("--") + GTEST_FLAG_PREFIX_ +
+ "internal_run_death_test=" + file_ + "|" + StreamableToString(line_) +
+ "|" + StreamableToString(death_test_index) + "|" +
StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
// size_t has the same width as pointers on both 32-bit and 64-bit
// Windows platforms.
// See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
- "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) +
- "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+ "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) + "|" +
+ StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
char executable_path[_MAX_PATH + 1]; // NOLINT
GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr,
executable_path,
_MAX_PATH));
- std::string command_line =
- std::string(::GetCommandLineA()) + " " + filter_flag + " \"" +
- internal_flag + "\"";
+ std::string command_line = std::string(::GetCommandLineA()) + " " +
+ filter_flag + " \"" + internal_flag + "\"";
DeathTest::set_last_death_test_message("");
@@ -796,8 +795,8 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
GTEST_DEATH_TEST_CHECK_(
::CreateProcessA(
executable_path, const_cast<char*>(command_line.c_str()),
- nullptr, // Retuned process handle is not inheritable.
- nullptr, // Retuned thread handle is not inheritable.
+ nullptr, // Returned process handle is not inheritable.
+ nullptr, // Returned thread handle is not inheritable.
TRUE, // Child inherits all inheritable handles (for write_handle_).
0x0, // Default creation flags.
nullptr, // Inherit the parent's environment.
@@ -809,7 +808,7 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
return OVERSEE_TEST;
}
-# elif GTEST_OS_FUCHSIA
+#elif GTEST_OS_FUCHSIA
class FuchsiaDeathTest : public DeathTestImpl {
public:
@@ -855,18 +854,13 @@ class Arguments {
template <typename Str>
void AddArguments(const ::std::vector<Str>& arguments) {
for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
- i != arguments.end();
- ++i) {
+ i != arguments.end(); ++i) {
args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
}
}
- char* const* Argv() {
- return &args_[0];
- }
+ char* const* Argv() { return &args_[0]; }
- int size() {
- return static_cast<int>(args_.size()) - 1;
- }
+ int size() { return static_cast<int>(args_.size()) - 1; }
private:
std::vector<char*> args_;
@@ -880,8 +874,7 @@ int FuchsiaDeathTest::Wait() {
const int kSocketKey = 1;
const int kExceptionKey = 2;
- if (!spawned())
- return 0;
+ if (!spawned()) return 0;
// Create a port to wait for socket/task/exception events.
zx_status_t status_zx;
@@ -890,8 +883,8 @@ int FuchsiaDeathTest::Wait() {
GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
// Register to wait for the child process to terminate.
- status_zx = child_process_.wait_async(
- port, kProcessKey, ZX_PROCESS_TERMINATED, 0);
+ status_zx =
+ child_process_.wait_async(port, kProcessKey, ZX_PROCESS_TERMINATED, 0);
GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
// Register to wait for the socket to be readable or closed.
@@ -900,8 +893,8 @@ int FuchsiaDeathTest::Wait() {
GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
// Register to wait for an exception.
- status_zx = exception_channel_.wait_async(
- port, kExceptionKey, ZX_CHANNEL_READABLE, 0);
+ status_zx = exception_channel_.wait_async(port, kExceptionKey,
+ ZX_CHANNEL_READABLE, 0);
GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
bool process_terminated = false;
@@ -931,9 +924,9 @@ int FuchsiaDeathTest::Wait() {
size_t old_length = captured_stderr_.length();
size_t bytes_read = 0;
captured_stderr_.resize(old_length + kBufferSize);
- status_zx = stderr_socket_.read(
- 0, &captured_stderr_.front() + old_length, kBufferSize,
- &bytes_read);
+ status_zx =
+ stderr_socket_.read(0, &captured_stderr_.front() + old_length,
+ kBufferSize, &bytes_read);
captured_stderr_.resize(old_length + bytes_read);
} while (status_zx == ZX_OK);
if (status_zx == ZX_ERR_PEER_CLOSED) {
@@ -987,13 +980,12 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
// Build the child process command line.
const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
- kFilterFlag + "=" + info->test_suite_name() +
- "." + info->name();
- const std::string internal_flag =
- std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
- + file_ + "|"
- + StreamableToString(line_) + "|"
- + StreamableToString(death_test_index);
+ "filter=" + info->test_suite_name() + "." +
+ info->name();
+ const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+ kInternalRunDeathTestFlag + "=" + file_ +
+ "|" + StreamableToString(line_) + "|" +
+ StreamableToString(death_test_index);
Arguments args;
args.AddArguments(GetInjectableArgvs());
args.AddArgument(filter_flag.c_str());
@@ -1016,8 +1008,7 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
// Create a socket pair will be used to receive the child process' stderr.
zx::socket stderr_producer_socket;
- status =
- zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
+ status = zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
GTEST_DEATH_TEST_CHECK_(status >= 0);
int stderr_producer_fd = -1;
status =
@@ -1034,35 +1025,32 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
// Create a child job.
zx_handle_t child_job = ZX_HANDLE_INVALID;
- status = zx_job_create(zx_job_default(), 0, & child_job);
+ status = zx_job_create(zx_job_default(), 0, &child_job);
GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
zx_policy_basic_t policy;
policy.condition = ZX_POL_NEW_ANY;
policy.policy = ZX_POL_ACTION_ALLOW;
- status = zx_job_set_policy(
- child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC, &policy, 1);
+ status = zx_job_set_policy(child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC,
+ &policy, 1);
GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
// Create an exception channel attached to the |child_job|, to allow
// us to suppress the system default exception handler from firing.
- status =
- zx_task_create_exception_channel(
- child_job, 0, exception_channel_.reset_and_get_address());
+ status = zx_task_create_exception_channel(
+ child_job, 0, exception_channel_.reset_and_get_address());
GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
// Spawn the child process.
- status = fdio_spawn_etc(
- child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0], args.Argv(), nullptr,
- 2, spawn_actions, child_process_.reset_and_get_address(), nullptr);
+ status = fdio_spawn_etc(child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0],
+ args.Argv(), nullptr, 2, spawn_actions,
+ child_process_.reset_and_get_address(), nullptr);
GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
set_spawned(true);
return OVERSEE_TEST;
}
-std::string FuchsiaDeathTest::GetErrorLogs() {
- return captured_stderr_;
-}
+std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; }
#else // We are neither on Windows, nor on Fuchsia.
@@ -1093,8 +1081,7 @@ ForkingDeathTest::ForkingDeathTest(const char* a_statement,
// status, or 0 if no child process exists. As a side effect, sets the
// outcome data member.
int ForkingDeathTest::Wait() {
- if (!spawned())
- return 0;
+ if (!spawned()) return 0;
ReadAndInterpretStatusByte();
@@ -1173,11 +1160,11 @@ class ExecDeathTest : public ForkingDeathTest {
private:
static ::std::vector<std::string> GetArgvsForDeathTestChildProcess() {
::std::vector<std::string> args = GetInjectableArgvs();
-# if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+#if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
::std::vector<std::string> extra_args =
GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_();
args.insert(args.end(), extra_args.begin(), extra_args.end());
-# endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+#endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
return args;
}
// The name of the file in which the death test is located.
@@ -1204,14 +1191,11 @@ class Arguments {
template <typename Str>
void AddArguments(const ::std::vector<Str>& arguments) {
for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
- i != arguments.end();
- ++i) {
+ i != arguments.end(); ++i) {
args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
}
}
- char* const* Argv() {
- return &args_[0];
- }
+ char* const* Argv() { return &args_[0]; }
private:
std::vector<char*> args_;
@@ -1224,9 +1208,9 @@ struct ExecDeathTestArgs {
int close_fd; // File descriptor to close; the read end of a pipe
};
-# if GTEST_OS_QNX
+#if GTEST_OS_QNX
extern "C" char** environ;
-# else // GTEST_OS_QNX
+#else // GTEST_OS_QNX
// The main function for a threadsafe-style death test child process.
// This function is called in a clone()-ed process and thus must avoid
// any potentially unsafe operations like malloc or libc functions.
@@ -1241,8 +1225,8 @@ static int ExecDeathTestChildMain(void* child_arg) {
UnitTest::GetInstance()->original_working_dir();
// We can safely call chdir() as it's a direct system call.
if (chdir(original_dir) != 0) {
- DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
- GetLastErrnoDescription());
+ DeathTestAbort(std::string("chdir(\"") + original_dir +
+ "\") failed: " + GetLastErrnoDescription());
return EXIT_FAILURE;
}
@@ -1253,13 +1237,12 @@ static int ExecDeathTestChildMain(void* child_arg) {
// one path separator.
execv(args->argv[0], args->argv);
DeathTestAbort(std::string("execv(") + args->argv[0] + ", ...) in " +
- original_dir + " failed: " +
- GetLastErrnoDescription());
+ original_dir + " failed: " + GetLastErrnoDescription());
return EXIT_FAILURE;
}
-# endif // GTEST_OS_QNX
+#endif // GTEST_OS_QNX
-# if GTEST_HAS_CLONE
+#if GTEST_HAS_CLONE
// Two utility routines that together determine the direction the stack
// grows.
// This could be accomplished more elegantly by a single recursive
@@ -1293,7 +1276,7 @@ static bool StackGrowsDown() {
StackLowerThanAddress(&dummy, &result);
return result;
}
-# endif // GTEST_HAS_CLONE
+#endif // GTEST_HAS_CLONE
// Spawns a child process with the same executable as the current process in
// a thread-safe manner and instructs it to run the death test. The
@@ -1303,10 +1286,10 @@ static bool StackGrowsDown() {
// spawn(2) there instead. The function dies with an error message if
// anything goes wrong.
static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
- ExecDeathTestArgs args = { argv, close_fd };
+ ExecDeathTestArgs args = {argv, close_fd};
pid_t child_pid = -1;
-# if GTEST_OS_QNX
+#if GTEST_OS_QNX
// Obtains the current directory and sets it to be closed in the child
// process.
const int cwd_fd = open(".", O_RDONLY);
@@ -1319,16 +1302,16 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
UnitTest::GetInstance()->original_working_dir();
// We can safely call chdir() as it's a direct system call.
if (chdir(original_dir) != 0) {
- DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
- GetLastErrnoDescription());
+ DeathTestAbort(std::string("chdir(\"") + original_dir +
+ "\") failed: " + GetLastErrnoDescription());
return EXIT_FAILURE;
}
int fd_flags;
// Set close_fd to be closed after spawn.
GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
- GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD,
- fd_flags | FD_CLOEXEC));
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(
+ fcntl(close_fd, F_SETFD, fd_flags | FD_CLOEXEC));
struct inheritance inherit = {0};
// spawn is a system call.
child_pid = spawn(args.argv[0], 0, nullptr, &inherit, args.argv, environ);
@@ -1336,8 +1319,8 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
-# else // GTEST_OS_QNX
-# if GTEST_OS_LINUX
+#else // GTEST_OS_QNX
+#if GTEST_OS_LINUX
// When a SIGPROF signal is received while fork() or clone() are executing,
// the process may hang. To avoid this, we ignore SIGPROF here and re-enable
// it after the call to fork()/clone() is complete.
@@ -1346,12 +1329,12 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
sigemptyset(&ignore_sigprof_action.sa_mask);
ignore_sigprof_action.sa_handler = SIG_IGN;
- GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction(
- SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
-# endif // GTEST_OS_LINUX
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(
+ sigaction(SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+#endif // GTEST_OS_LINUX
-# if GTEST_HAS_CLONE
- const bool use_fork = GTEST_FLAG(death_test_use_fork);
+#if GTEST_HAS_CLONE
+ const bool use_fork = GTEST_FLAG_GET(death_test_use_fork);
if (!use_fork) {
static const bool stack_grows_down = StackGrowsDown();
@@ -1370,7 +1353,7 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
const size_t kMaxStackAlignment = 64;
void* const stack_top =
static_cast<char*>(stack) +
- (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+ (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
GTEST_DEATH_TEST_CHECK_(
static_cast<size_t>(stack_size) > kMaxStackAlignment &&
reinterpret_cast<uintptr_t>(stack_top) % kMaxStackAlignment == 0);
@@ -1379,19 +1362,19 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
}
-# else
+#else
const bool use_fork = true;
-# endif // GTEST_HAS_CLONE
+#endif // GTEST_HAS_CLONE
if (use_fork && (child_pid = fork()) == 0) {
- ExecDeathTestChildMain(&args);
- _exit(0);
+ ExecDeathTestChildMain(&args);
+ _exit(0);
}
-# endif // GTEST_OS_QNX
-# if GTEST_OS_LINUX
+#endif // GTEST_OS_QNX
+#if GTEST_OS_LINUX
GTEST_DEATH_TEST_CHECK_SYSCALL_(
sigaction(SIGPROF, &saved_sigprof_action, nullptr));
-# endif // GTEST_OS_LINUX
+#endif // GTEST_OS_LINUX
GTEST_DEATH_TEST_CHECK_(child_pid != -1);
return child_pid;
@@ -1420,13 +1403,13 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() {
GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
- kFilterFlag + "=" + info->test_suite_name() +
- "." + info->name();
- const std::string internal_flag =
- std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
- + file_ + "|" + StreamableToString(line_) + "|"
- + StreamableToString(death_test_index) + "|"
- + StreamableToString(pipe_fd[1]);
+ "filter=" + info->test_suite_name() + "." +
+ info->name();
+ const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+ "internal_run_death_test=" + file_ + "|" +
+ StreamableToString(line_) + "|" +
+ StreamableToString(death_test_index) + "|" +
+ StreamableToString(pipe_fd[1]);
Arguments args;
args.AddArguments(GetArgvsForDeathTestChildProcess());
args.AddArgument(filter_flag.c_str());
@@ -1447,7 +1430,7 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() {
return OVERSEE_TEST;
}
-# endif // !GTEST_OS_WINDOWS
+#endif // !GTEST_OS_WINDOWS
// Creates a concrete DeathTest-derived class that depends on the
// --gtest_death_test_style flag, and sets the pointer pointed to
@@ -1461,15 +1444,15 @@ bool DefaultDeathTestFactory::Create(const char* statement,
UnitTestImpl* const impl = GetUnitTestImpl();
const InternalRunDeathTestFlag* const flag =
impl->internal_run_death_test_flag();
- const int death_test_index = impl->current_test_info()
- ->increment_death_test_count();
+ const int death_test_index =
+ impl->current_test_info()->increment_death_test_count();
if (flag != nullptr) {
if (death_test_index > flag->index()) {
DeathTest::set_last_death_test_message(
- "Death test count (" + StreamableToString(death_test_index)
- + ") somehow exceeded expected maximum ("
- + StreamableToString(flag->index()) + ")");
+ "Death test count (" + StreamableToString(death_test_index) +
+ ") somehow exceeded expected maximum (" +
+ StreamableToString(flag->index()) + ")");
return false;
}
@@ -1480,50 +1463,50 @@ bool DefaultDeathTestFactory::Create(const char* statement,
}
}
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
- if (GTEST_FLAG(death_test_style) == "threadsafe" ||
- GTEST_FLAG(death_test_style) == "fast") {
+ if (GTEST_FLAG_GET(death_test_style) == "threadsafe" ||
+ GTEST_FLAG_GET(death_test_style) == "fast") {
*test = new WindowsDeathTest(statement, std::move(matcher), file, line);
}
-# elif GTEST_OS_FUCHSIA
+#elif GTEST_OS_FUCHSIA
- if (GTEST_FLAG(death_test_style) == "threadsafe" ||
- GTEST_FLAG(death_test_style) == "fast") {
+ if (GTEST_FLAG_GET(death_test_style) == "threadsafe" ||
+ GTEST_FLAG_GET(death_test_style) == "fast") {
*test = new FuchsiaDeathTest(statement, std::move(matcher), file, line);
}
-# else
+#else
- if (GTEST_FLAG(death_test_style) == "threadsafe") {
+ if (GTEST_FLAG_GET(death_test_style) == "threadsafe") {
*test = new ExecDeathTest(statement, std::move(matcher), file, line);
- } else if (GTEST_FLAG(death_test_style) == "fast") {
+ } else if (GTEST_FLAG_GET(death_test_style) == "fast") {
*test = new NoExecDeathTest(statement, std::move(matcher));
}
-# endif // GTEST_OS_WINDOWS
+#endif // GTEST_OS_WINDOWS
else { // NOLINT - this is more readable than unbalanced brackets inside #if.
- DeathTest::set_last_death_test_message(
- "Unknown death test style \"" + GTEST_FLAG(death_test_style)
- + "\" encountered");
+ DeathTest::set_last_death_test_message("Unknown death test style \"" +
+ GTEST_FLAG_GET(death_test_style) +
+ "\" encountered");
return false;
}
return true;
}
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
// Recreates the pipe and event handles from the provided parameters,
// signals the event, and returns a file descriptor wrapped around the pipe
// handle. This function is called in the child process only.
static int GetStatusFileDescriptor(unsigned int parent_process_id,
- size_t write_handle_as_size_t,
- size_t event_handle_as_size_t) {
+ size_t write_handle_as_size_t,
+ size_t event_handle_as_size_t) {
AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
- FALSE, // Non-inheritable.
- parent_process_id));
+ FALSE, // Non-inheritable.
+ parent_process_id));
if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
DeathTestAbort("Unable to open parent process " +
StreamableToString(parent_process_id));
@@ -1531,8 +1514,7 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id,
GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
- const HANDLE write_handle =
- reinterpret_cast<HANDLE>(write_handle_as_size_t);
+ const HANDLE write_handle = reinterpret_cast<HANDLE>(write_handle_as_size_t);
HANDLE dup_write_handle;
// The newly initialized handle is accessible only in the parent
@@ -1554,9 +1536,7 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id,
HANDLE dup_event_handle;
if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
- ::GetCurrentProcess(), &dup_event_handle,
- 0x0,
- FALSE,
+ ::GetCurrentProcess(), &dup_event_handle, 0x0, FALSE,
DUPLICATE_SAME_ACCESS)) {
DeathTestAbort("Unable to duplicate the event handle " +
StreamableToString(event_handle_as_size_t) +
@@ -1578,61 +1558,57 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id,
return write_fd;
}
-# endif // GTEST_OS_WINDOWS
+#endif // GTEST_OS_WINDOWS
// Returns a newly created InternalRunDeathTestFlag object with fields
// initialized from the GTEST_FLAG(internal_run_death_test) flag if
// the flag is specified; otherwise returns NULL.
InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
- if (GTEST_FLAG(internal_run_death_test) == "") return nullptr;
+ if (GTEST_FLAG_GET(internal_run_death_test) == "") return nullptr;
// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
// can use it here.
int line = -1;
int index = -1;
::std::vector< ::std::string> fields;
- SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
+ SplitString(GTEST_FLAG_GET(internal_run_death_test), '|', &fields);
int write_fd = -1;
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
unsigned int parent_process_id = 0;
size_t write_handle_as_size_t = 0;
size_t event_handle_as_size_t = 0;
- if (fields.size() != 6
- || !ParseNaturalNumber(fields[1], &line)
- || !ParseNaturalNumber(fields[2], &index)
- || !ParseNaturalNumber(fields[3], &parent_process_id)
- || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
- || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+ if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) ||
+ !ParseNaturalNumber(fields[2], &index) ||
+ !ParseNaturalNumber(fields[3], &parent_process_id) ||
+ !ParseNaturalNumber(fields[4], &write_handle_as_size_t) ||
+ !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
- GTEST_FLAG(internal_run_death_test));
+ GTEST_FLAG_GET(internal_run_death_test));
}
- write_fd = GetStatusFileDescriptor(parent_process_id,
- write_handle_as_size_t,
+ write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t,
event_handle_as_size_t);
-# elif GTEST_OS_FUCHSIA
+#elif GTEST_OS_FUCHSIA
- if (fields.size() != 3
- || !ParseNaturalNumber(fields[1], &line)
- || !ParseNaturalNumber(fields[2], &index)) {
- DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
- + GTEST_FLAG(internal_run_death_test));
+ if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) ||
+ !ParseNaturalNumber(fields[2], &index)) {
+ DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+ GTEST_FLAG_GET(internal_run_death_test));
}
-# else
+#else
- if (fields.size() != 4
- || !ParseNaturalNumber(fields[1], &line)
- || !ParseNaturalNumber(fields[2], &index)
- || !ParseNaturalNumber(fields[3], &write_fd)) {
- DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
- + GTEST_FLAG(internal_run_death_test));
+ if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) ||
+ !ParseNaturalNumber(fields[2], &index) ||
+ !ParseNaturalNumber(fields[3], &write_fd)) {
+ DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+ GTEST_FLAG_GET(internal_run_death_test));
}
-# endif // GTEST_OS_WINDOWS
+#endif // GTEST_OS_WINDOWS
return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
}
diff --git a/third_party/googletest/src/src/gtest-filepath.cc b/third_party/googletest/src/src/gtest-filepath.cc
index 0b5629401..f6ee90cdb 100644
--- a/third_party/googletest/src/src/gtest-filepath.cc
+++ b/third_party/googletest/src/src/gtest-filepath.cc
@@ -30,29 +30,31 @@
#include "gtest/internal/gtest-filepath.h"
#include <stdlib.h>
-#include "gtest/internal/gtest-port.h"
+
#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-port.h"
#if GTEST_OS_WINDOWS_MOBILE
-# include <windows.h>
+#include <windows.h>
#elif GTEST_OS_WINDOWS
-# include <direct.h>
-# include <io.h>
+#include <direct.h>
+#include <io.h>
#else
-# include <limits.h>
-# include <climits> // Some Linux distributions define PATH_MAX here.
-#endif // GTEST_OS_WINDOWS_MOBILE
+#include <limits.h>
+
+#include <climits> // Some Linux distributions define PATH_MAX here.
+#endif // GTEST_OS_WINDOWS_MOBILE
#include "gtest/internal/gtest-string.h"
#if GTEST_OS_WINDOWS
-# define GTEST_PATH_MAX_ _MAX_PATH
+#define GTEST_PATH_MAX_ _MAX_PATH
#elif defined(PATH_MAX)
-# define GTEST_PATH_MAX_ PATH_MAX
+#define GTEST_PATH_MAX_ PATH_MAX
#elif defined(_XOPEN_PATH_MAX)
-# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+#define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
#else
-# define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+#define GTEST_PATH_MAX_ _POSIX_PATH_MAX
#endif // GTEST_OS_WINDOWS
namespace testing {
@@ -66,16 +68,16 @@ namespace internal {
const char kPathSeparator = '\\';
const char kAlternatePathSeparator = '/';
const char kAlternatePathSeparatorString[] = "/";
-# if GTEST_OS_WINDOWS_MOBILE
+#if GTEST_OS_WINDOWS_MOBILE
// Windows CE doesn't have a current directory. You should not use
// the current directory in tests on Windows CE, but this at least
// provides a reasonable fallback.
const char kCurrentDirectoryString[] = "\\";
// Windows CE doesn't define INVALID_FILE_ATTRIBUTES
const DWORD kInvalidFileAttributes = 0xffffffff;
-# else
+#else
const char kCurrentDirectoryString[] = ".\\";
-# endif // GTEST_OS_WINDOWS_MOBILE
+#endif // GTEST_OS_WINDOWS_MOBILE
#else
const char kPathSeparator = '/';
const char kCurrentDirectoryString[] = "./";
@@ -99,17 +101,17 @@ FilePath FilePath::GetCurrentDir() {
// something reasonable.
return FilePath(kCurrentDirectoryString);
#elif GTEST_OS_WINDOWS
- char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+ char cwd[GTEST_PATH_MAX_ + 1] = {'\0'};
return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd);
#else
- char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+ char cwd[GTEST_PATH_MAX_ + 1] = {'\0'};
char* result = getcwd(cwd, sizeof(cwd));
-# if GTEST_OS_NACL
+#if GTEST_OS_NACL
// getcwd will likely fail in NaCl due to the sandbox, so return something
// reasonable. The user may have provided a shim implementation for getcwd,
// however, so fallback only when failure is detected.
return FilePath(result == nullptr ? kCurrentDirectoryString : cwd);
-# endif // GTEST_OS_NACL
+#endif // GTEST_OS_NACL
return FilePath(result == nullptr ? "" : cwd);
#endif // GTEST_OS_WINDOWS_MOBILE
}
@@ -121,8 +123,8 @@ FilePath FilePath::GetCurrentDir() {
FilePath FilePath::RemoveExtension(const char* extension) const {
const std::string dot_extension = std::string(".") + extension;
if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) {
- return FilePath(pathname_.substr(
- 0, pathname_.length() - dot_extension.length()));
+ return FilePath(
+ pathname_.substr(0, pathname_.length() - dot_extension.length()));
}
return *this;
}
@@ -178,15 +180,14 @@ FilePath FilePath::RemoveFileName() const {
// than zero (e.g., 12), returns "dir/test_12.xml".
// On Windows platform, uses \ as the separator rather than /.
FilePath FilePath::MakeFileName(const FilePath& directory,
- const FilePath& base_name,
- int number,
+ const FilePath& base_name, int number,
const char* extension) {
std::string file;
if (number == 0) {
file = base_name.string() + "." + extension;
} else {
- file = base_name.string() + "_" + StreamableToString(number)
- + "." + extension;
+ file =
+ base_name.string() + "_" + StreamableToString(number) + "." + extension;
}
return ConcatPaths(directory, FilePath(file));
}
@@ -195,8 +196,7 @@ FilePath FilePath::MakeFileName(const FilePath& directory,
// On Windows, uses \ as the separator rather than /.
FilePath FilePath::ConcatPaths(const FilePath& directory,
const FilePath& relative_path) {
- if (directory.IsEmpty())
- return relative_path;
+ if (directory.IsEmpty()) return relative_path;
const FilePath dir(directory.RemoveTrailingPathSeparator());
return FilePath(dir.string() + kPathSeparator + relative_path.string());
}
@@ -207,7 +207,7 @@ bool FilePath::FileOrDirectoryExists() const {
#if GTEST_OS_WINDOWS_MOBILE
LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
const DWORD attributes = GetFileAttributes(unicode);
- delete [] unicode;
+ delete[] unicode;
return attributes != kInvalidFileAttributes;
#else
posix::StatStruct file_stat{};
@@ -222,8 +222,8 @@ bool FilePath::DirectoryExists() const {
#if GTEST_OS_WINDOWS
// Don't strip off trailing separator if path is a root directory on
// Windows (like "C:\\").
- const FilePath& path(IsRootDirectory() ? *this :
- RemoveTrailingPathSeparator());
+ const FilePath& path(IsRootDirectory() ? *this
+ : RemoveTrailingPathSeparator());
#else
const FilePath& path(*this);
#endif
@@ -231,15 +231,15 @@ bool FilePath::DirectoryExists() const {
#if GTEST_OS_WINDOWS_MOBILE
LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
const DWORD attributes = GetFileAttributes(unicode);
- delete [] unicode;
+ delete[] unicode;
if ((attributes != kInvalidFileAttributes) &&
(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
result = true;
}
#else
posix::StatStruct file_stat{};
- result = posix::Stat(path.c_str(), &file_stat) == 0 &&
- posix::IsDir(file_stat);
+ result =
+ posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat);
#endif // GTEST_OS_WINDOWS_MOBILE
return result;
@@ -260,10 +260,9 @@ bool FilePath::IsAbsolutePath() const {
const char* const name = pathname_.c_str();
#if GTEST_OS_WINDOWS
return pathname_.length() >= 3 &&
- ((name[0] >= 'a' && name[0] <= 'z') ||
- (name[0] >= 'A' && name[0] <= 'Z')) &&
- name[1] == ':' &&
- IsPathSeparator(name[2]);
+ ((name[0] >= 'a' && name[0] <= 'z') ||
+ (name[0] >= 'A' && name[0] <= 'Z')) &&
+ name[1] == ':' && IsPathSeparator(name[2]);
#else
return IsPathSeparator(name[0]);
#endif
@@ -321,7 +320,7 @@ bool FilePath::CreateFolder() const {
FilePath removed_sep(this->RemoveTrailingPathSeparator());
LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
int result = CreateDirectory(unicode, nullptr) ? 0 : -1;
- delete [] unicode;
+ delete[] unicode;
#elif GTEST_OS_WINDOWS
int result = _mkdir(pathname_.c_str());
#elif GTEST_OS_ESP8266 || GTEST_OS_XTENSA
@@ -341,9 +340,8 @@ bool FilePath::CreateFolder() const {
// name, otherwise return the name string unmodified.
// On Windows platform, uses \ as the separator, other platforms use /.
FilePath FilePath::RemoveTrailingPathSeparator() const {
- return IsDirectory()
- ? FilePath(pathname_.substr(0, pathname_.length() - 1))
- : *this;
+ return IsDirectory() ? FilePath(pathname_.substr(0, pathname_.length() - 1))
+ : *this;
}
// Removes any redundant separators that might be in the pathname.
diff --git a/third_party/googletest/src/src/gtest-internal-inl.h b/third_party/googletest/src/src/gtest-internal-inl.h
index 6d8cecbbb..0b9e929c6 100644
--- a/third_party/googletest/src/src/gtest-internal-inl.h
+++ b/third_party/googletest/src/src/gtest-internal-inl.h
@@ -35,7 +35,7 @@
#define GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
#ifndef _WIN32_WCE
-# include <errno.h>
+#include <errno.h>
#endif // !_WIN32_WCE
#include <stddef.h>
#include <stdlib.h> // For strtoll/_strtoul64/malloc/free.
@@ -50,22 +50,20 @@
#include "gtest/internal/gtest-port.h"
#if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h> // NOLINT
-# include <netdb.h> // NOLINT
+#include <arpa/inet.h> // NOLINT
+#include <netdb.h> // NOLINT
#endif
#if GTEST_OS_WINDOWS
-# include <windows.h> // NOLINT
-#endif // GTEST_OS_WINDOWS
+#include <windows.h> // NOLINT
+#endif // GTEST_OS_WINDOWS
-#include "gtest/gtest.h"
#include "gtest/gtest-spi.h"
+#include "gtest/gtest.h"
GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
/* class A needs to have dll-interface to be used by clients of class B */)
-namespace testing {
-
// Declares the flags.
//
// We don't want the users to modify this flag in the code, but want
@@ -73,32 +71,13 @@ namespace testing {
// declare it here as opposed to in gtest.h.
GTEST_DECLARE_bool_(death_test_use_fork);
+namespace testing {
namespace internal {
// The value of GetTestTypeId() as seen from within the Google Test
// library. This is solely for testing GetTestTypeId().
GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
-// Names of the flags (needed for parsing Google Test flags).
-const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
-const char kBreakOnFailureFlag[] = "break_on_failure";
-const char kCatchExceptionsFlag[] = "catch_exceptions";
-const char kColorFlag[] = "color";
-const char kFailFast[] = "fail_fast";
-const char kFilterFlag[] = "filter";
-const char kListTestsFlag[] = "list_tests";
-const char kOutputFlag[] = "output";
-const char kBriefFlag[] = "brief";
-const char kPrintTimeFlag[] = "print_time";
-const char kPrintUTF8Flag[] = "print_utf8";
-const char kRandomSeedFlag[] = "random_seed";
-const char kRepeatFlag[] = "repeat";
-const char kShuffleFlag[] = "shuffle";
-const char kStackTraceDepthFlag[] = "stack_trace_depth";
-const char kStreamResultToFlag[] = "stream_result_to";
-const char kThrowOnFailureFlag[] = "throw_on_failure";
-const char kFlagfileFlag[] = "flagfile";
-
// A valid random seed must be in [1, kMaxRandomSeed].
const int kMaxRandomSeed = 99999;
@@ -125,21 +104,21 @@ GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
-GTEST_API_ bool ParseInt32Flag(
- const char* str, const char* flag, int32_t* value);
+GTEST_API_ bool ParseFlag(const char* str, const char* flag, int32_t* value);
// Returns a random seed in range [1, kMaxRandomSeed] based on the
// given --gtest_random_seed flag value.
inline int GetRandomSeedFromFlag(int32_t random_seed_flag) {
- const unsigned int raw_seed = (random_seed_flag == 0) ?
- static_cast<unsigned int>(GetTimeInMillis()) :
- static_cast<unsigned int>(random_seed_flag);
+ const unsigned int raw_seed =
+ (random_seed_flag == 0) ? static_cast<unsigned int>(GetTimeInMillis())
+ : static_cast<unsigned int>(random_seed_flag);
// Normalizes the actual seed to range [1, kMaxRandomSeed] such that
// it's easy to type.
const int normalized_seed =
static_cast<int>((raw_seed - 1U) %
- static_cast<unsigned int>(kMaxRandomSeed)) + 1;
+ static_cast<unsigned int>(kMaxRandomSeed)) +
+ 1;
return normalized_seed;
}
@@ -160,50 +139,54 @@ class GTestFlagSaver {
public:
// The c'tor.
GTestFlagSaver() {
- also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests);
- break_on_failure_ = GTEST_FLAG(break_on_failure);
- catch_exceptions_ = GTEST_FLAG(catch_exceptions);
- color_ = GTEST_FLAG(color);
- death_test_style_ = GTEST_FLAG(death_test_style);
- death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
- fail_fast_ = GTEST_FLAG(fail_fast);
- filter_ = GTEST_FLAG(filter);
- internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
- list_tests_ = GTEST_FLAG(list_tests);
- output_ = GTEST_FLAG(output);
- brief_ = GTEST_FLAG(brief);
- print_time_ = GTEST_FLAG(print_time);
- print_utf8_ = GTEST_FLAG(print_utf8);
- random_seed_ = GTEST_FLAG(random_seed);
- repeat_ = GTEST_FLAG(repeat);
- shuffle_ = GTEST_FLAG(shuffle);
- stack_trace_depth_ = GTEST_FLAG(stack_trace_depth);
- stream_result_to_ = GTEST_FLAG(stream_result_to);
- throw_on_failure_ = GTEST_FLAG(throw_on_failure);
+ also_run_disabled_tests_ = GTEST_FLAG_GET(also_run_disabled_tests);
+ break_on_failure_ = GTEST_FLAG_GET(break_on_failure);
+ catch_exceptions_ = GTEST_FLAG_GET(catch_exceptions);
+ color_ = GTEST_FLAG_GET(color);
+ death_test_style_ = GTEST_FLAG_GET(death_test_style);
+ death_test_use_fork_ = GTEST_FLAG_GET(death_test_use_fork);
+ fail_fast_ = GTEST_FLAG_GET(fail_fast);
+ filter_ = GTEST_FLAG_GET(filter);
+ internal_run_death_test_ = GTEST_FLAG_GET(internal_run_death_test);
+ list_tests_ = GTEST_FLAG_GET(list_tests);
+ output_ = GTEST_FLAG_GET(output);
+ brief_ = GTEST_FLAG_GET(brief);
+ print_time_ = GTEST_FLAG_GET(print_time);
+ print_utf8_ = GTEST_FLAG_GET(print_utf8);
+ random_seed_ = GTEST_FLAG_GET(random_seed);
+ repeat_ = GTEST_FLAG_GET(repeat);
+ recreate_environments_when_repeating_ =
+ GTEST_FLAG_GET(recreate_environments_when_repeating);
+ shuffle_ = GTEST_FLAG_GET(shuffle);
+ stack_trace_depth_ = GTEST_FLAG_GET(stack_trace_depth);
+ stream_result_to_ = GTEST_FLAG_GET(stream_result_to);
+ throw_on_failure_ = GTEST_FLAG_GET(throw_on_failure);
}
// The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS.
~GTestFlagSaver() {
- GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
- GTEST_FLAG(break_on_failure) = break_on_failure_;
- GTEST_FLAG(catch_exceptions) = catch_exceptions_;
- GTEST_FLAG(color) = color_;
- GTEST_FLAG(death_test_style) = death_test_style_;
- GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
- GTEST_FLAG(filter) = filter_;
- GTEST_FLAG(fail_fast) = fail_fast_;
- GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
- GTEST_FLAG(list_tests) = list_tests_;
- GTEST_FLAG(output) = output_;
- GTEST_FLAG(brief) = brief_;
- GTEST_FLAG(print_time) = print_time_;
- GTEST_FLAG(print_utf8) = print_utf8_;
- GTEST_FLAG(random_seed) = random_seed_;
- GTEST_FLAG(repeat) = repeat_;
- GTEST_FLAG(shuffle) = shuffle_;
- GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
- GTEST_FLAG(stream_result_to) = stream_result_to_;
- GTEST_FLAG(throw_on_failure) = throw_on_failure_;
+ GTEST_FLAG_SET(also_run_disabled_tests, also_run_disabled_tests_);
+ GTEST_FLAG_SET(break_on_failure, break_on_failure_);
+ GTEST_FLAG_SET(catch_exceptions, catch_exceptions_);
+ GTEST_FLAG_SET(color, color_);
+ GTEST_FLAG_SET(death_test_style, death_test_style_);
+ GTEST_FLAG_SET(death_test_use_fork, death_test_use_fork_);
+ GTEST_FLAG_SET(filter, filter_);
+ GTEST_FLAG_SET(fail_fast, fail_fast_);
+ GTEST_FLAG_SET(internal_run_death_test, internal_run_death_test_);
+ GTEST_FLAG_SET(list_tests, list_tests_);
+ GTEST_FLAG_SET(output, output_);
+ GTEST_FLAG_SET(brief, brief_);
+ GTEST_FLAG_SET(print_time, print_time_);
+ GTEST_FLAG_SET(print_utf8, print_utf8_);
+ GTEST_FLAG_SET(random_seed, random_seed_);
+ GTEST_FLAG_SET(repeat, repeat_);
+ GTEST_FLAG_SET(recreate_environments_when_repeating,
+ recreate_environments_when_repeating_);
+ GTEST_FLAG_SET(shuffle, shuffle_);
+ GTEST_FLAG_SET(stack_trace_depth, stack_trace_depth_);
+ GTEST_FLAG_SET(stream_result_to, stream_result_to_);
+ GTEST_FLAG_SET(throw_on_failure, throw_on_failure_);
}
private:
@@ -224,6 +207,7 @@ class GTestFlagSaver {
bool print_utf8_;
int32_t random_seed_;
int32_t repeat_;
+ bool recreate_environments_when_repeating_;
bool shuffle_;
int32_t stack_trace_depth_;
std::string stream_result_to_;
@@ -278,8 +262,8 @@ GTEST_API_ int32_t Int32FromEnvOrDie(const char* env_var, int32_t default_val);
// returns true if and only if the test should be run on this shard. The test id
// is some arbitrary but unique non-negative integer assigned to each test
// method. Assumes that 0 <= shard_index < total_shards.
-GTEST_API_ bool ShouldRunTestOnShard(
- int total_shards, int shard_index, int test_id);
+GTEST_API_ bool ShouldRunTestOnShard(int total_shards, int shard_index,
+ int test_id);
// STL container utilities.
@@ -290,9 +274,8 @@ inline int CountIf(const Container& c, Predicate predicate) {
// Implemented as an explicit loop since std::count_if() in libCstd on
// Solaris has a non-standard signature.
int count = 0;
- for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
- if (predicate(*it))
- ++count;
+ for (auto it = c.begin(); it != c.end(); ++it) {
+ if (predicate(*it)) ++count;
}
return count;
}
@@ -441,7 +424,9 @@ class OsStackTraceGetterInterface {
static const char* const kElidedFramesMarker;
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
+ OsStackTraceGetterInterface(const OsStackTraceGetterInterface&) = delete;
+ OsStackTraceGetterInterface& operator=(const OsStackTraceGetterInterface&) =
+ delete;
};
// A working implementation of the OsStackTraceGetterInterface interface.
@@ -463,7 +448,8 @@ class OsStackTraceGetter : public OsStackTraceGetterInterface {
void* caller_frame_ = nullptr;
#endif // GTEST_HAS_ABSL
- GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
+ OsStackTraceGetter(const OsStackTraceGetter&) = delete;
+ OsStackTraceGetter& operator=(const OsStackTraceGetter&) = delete;
};
// Information about a Google Test trace point.
@@ -476,7 +462,7 @@ struct TraceInfo {
// This is the default global test part result reporter used in UnitTestImpl.
// This class should only be used by UnitTestImpl.
class DefaultGlobalTestPartResultReporter
- : public TestPartResultReporterInterface {
+ : public TestPartResultReporterInterface {
public:
explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
// Implements the TestPartResultReporterInterface. Reports the test part
@@ -486,7 +472,10 @@ class DefaultGlobalTestPartResultReporter
private:
UnitTestImpl* const unit_test_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
+ DefaultGlobalTestPartResultReporter(
+ const DefaultGlobalTestPartResultReporter&) = delete;
+ DefaultGlobalTestPartResultReporter& operator=(
+ const DefaultGlobalTestPartResultReporter&) = delete;
};
// This is the default per thread test part result reporter used in
@@ -502,7 +491,10 @@ class DefaultPerThreadTestPartResultReporter
private:
UnitTestImpl* const unit_test_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
+ DefaultPerThreadTestPartResultReporter(
+ const DefaultPerThreadTestPartResultReporter&) = delete;
+ DefaultPerThreadTestPartResultReporter& operator=(
+ const DefaultPerThreadTestPartResultReporter&) = delete;
};
// The private implementation of the UnitTest class. We don't protect
@@ -640,7 +632,8 @@ class GTEST_API_ UnitTestImpl {
// For example, if Foo() calls Bar(), which in turn calls
// CurrentOsStackTraceExceptTop(1), Foo() will be included in the
// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
- std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
+ std::string CurrentOsStackTraceExceptTop(int skip_count)
+ GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_;
// Finds and returns a TestSuite with the given name. If one doesn't
// exist, creates one and returns it.
@@ -744,9 +737,7 @@ class GTEST_API_ UnitTestImpl {
}
// Clears the results of ad-hoc test assertions.
- void ClearAdHocTestResult() {
- ad_hoc_test_result_.Clear();
- }
+ void ClearAdHocTestResult() { ad_hoc_test_result_.Clear(); }
// Adds a TestProperty to the current TestResult object when invoked in a
// context of a test or a test suite, or to the global property set. If the
@@ -754,10 +745,7 @@ class GTEST_API_ UnitTestImpl {
// updated.
void RecordProperty(const TestProperty& test_property);
- enum ReactionToSharding {
- HONOR_SHARDING_PROTOCOL,
- IGNORE_SHARDING_PROTOCOL
- };
+ enum ReactionToSharding { HONOR_SHARDING_PROTOCOL, IGNORE_SHARDING_PROTOCOL };
// Matches the full name of each test against the user-specified
// filter to decide whether the test should run, then records the
@@ -963,7 +951,8 @@ class GTEST_API_ UnitTestImpl {
// starts.
bool catch_exceptions_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
+ UnitTestImpl(const UnitTestImpl&) = delete;
+ UnitTestImpl& operator=(const UnitTestImpl&) = delete;
}; // class UnitTestImpl
// Convenience function for accessing the global UnitTest
@@ -986,8 +975,9 @@ GTEST_API_ bool IsValidEscape(char ch);
GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
GTEST_API_ bool ValidateRegex(const char* regex);
GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
-GTEST_API_ bool MatchRepetitionAndRegexAtHead(
- bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(bool escaped, char ch,
+ char repeat, const char* regex,
+ const char* str);
GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
#endif // GTEST_USES_SIMPLE_RE
@@ -1089,8 +1079,7 @@ class StreamingListener : public EmptyTestEventListener {
}
~SocketWriter() override {
- if (sockfd_ != -1)
- CloseConnection();
+ if (sockfd_ != -1) CloseConnection();
}
// Sends a string to the socket.
@@ -1100,9 +1089,8 @@ class StreamingListener : public EmptyTestEventListener {
const auto len = static_cast<size_t>(message.length());
if (write(sockfd_, message.c_str(), len) != static_cast<ssize_t>(len)) {
- GTEST_LOG_(WARNING)
- << "stream_result_to: failed to stream to "
- << host_name_ << ":" << port_num_;
+ GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to "
+ << host_name_ << ":" << port_num_;
}
}
@@ -1123,7 +1111,8 @@ class StreamingListener : public EmptyTestEventListener {
const std::string host_name_;
const std::string port_num_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
+ SocketWriter(const SocketWriter&) = delete;
+ SocketWriter& operator=(const SocketWriter&) = delete;
}; // class SocketWriter
// Escapes '=', '&', '%', and '\n' characters in str as "%xx".
@@ -1135,7 +1124,9 @@ class StreamingListener : public EmptyTestEventListener {
}
explicit StreamingListener(AbstractSocketWriter* socket_writer)
- : socket_writer_(socket_writer) { Start(); }
+ : socket_writer_(socket_writer) {
+ Start();
+ }
void OnTestProgramStart(const UnitTest& /* unit_test */) override {
SendLn("event=TestProgramStart");
@@ -1158,22 +1149,22 @@ class StreamingListener : public EmptyTestEventListener {
void OnTestIterationEnd(const UnitTest& unit_test,
int /* iteration */) override {
- SendLn("event=TestIterationEnd&passed=" +
- FormatBool(unit_test.Passed()) + "&elapsed_time=" +
- StreamableToString(unit_test.elapsed_time()) + "ms");
+ SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) +
+ "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) +
+ "ms");
}
// Note that "event=TestCaseStart" is a wire format and has to remain
// "case" for compatibility
- void OnTestCaseStart(const TestCase& test_case) override {
- SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
+ void OnTestSuiteStart(const TestSuite& test_suite) override {
+ SendLn(std::string("event=TestCaseStart&name=") + test_suite.name());
}
// Note that "event=TestCaseEnd" is a wire format and has to remain
// "case" for compatibility
- void OnTestCaseEnd(const TestCase& test_case) override {
- SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) +
- "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) +
+ void OnTestSuiteEnd(const TestSuite& test_suite) override {
+ SendLn("event=TestCaseEnd&passed=" + FormatBool(test_suite.Passed()) +
+ "&elapsed_time=" + StreamableToString(test_suite.elapsed_time()) +
"ms");
}
@@ -1183,8 +1174,7 @@ class StreamingListener : public EmptyTestEventListener {
void OnTestEnd(const TestInfo& test_info) override {
SendLn("event=TestEnd&passed=" +
- FormatBool((test_info.result())->Passed()) +
- "&elapsed_time=" +
+ FormatBool((test_info.result())->Passed()) + "&elapsed_time=" +
StreamableToString((test_info.result())->elapsed_time()) + "ms");
}
@@ -1208,7 +1198,8 @@ class StreamingListener : public EmptyTestEventListener {
const std::unique_ptr<AbstractSocketWriter> socket_writer_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
+ StreamingListener(const StreamingListener&) = delete;
+ StreamingListener& operator=(const StreamingListener&) = delete;
}; // class StreamingListener
#endif // GTEST_CAN_STREAM_RESULTS_
diff --git a/third_party/googletest/src/src/gtest-matchers.cc b/third_party/googletest/src/src/gtest-matchers.cc
index 65104ebab..7e3bcc0cf 100644
--- a/third_party/googletest/src/src/gtest-matchers.cc
+++ b/third_party/googletest/src/src/gtest-matchers.cc
@@ -32,12 +32,13 @@
// This file implements just enough of the matcher interface to allow
// EXPECT_DEATH and friends to accept a matcher argument.
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-port.h"
#include "gtest/gtest-matchers.h"
#include <string>
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
namespace testing {
// Constructs a matcher that matches a const std::string& whose value is
diff --git a/third_party/googletest/src/src/gtest-port.cc b/third_party/googletest/src/src/gtest-port.cc
index 53a4d37f9..d797fe4d5 100644
--- a/third_party/googletest/src/src/gtest-port.cc
+++ b/third_party/googletest/src/src/gtest-port.cc
@@ -27,61 +27,62 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
#include "gtest/internal/gtest-port.h"
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+
#include <cstdint>
#include <fstream>
#include <memory>
#if GTEST_OS_WINDOWS
-# include <windows.h>
-# include <io.h>
-# include <sys/stat.h>
-# include <map> // Used in ThreadLocal.
-# ifdef _MSC_VER
-# include <crtdbg.h>
-# endif // _MSC_VER
+#include <io.h>
+#include <sys/stat.h>
+#include <windows.h>
+
+#include <map> // Used in ThreadLocal.
+#ifdef _MSC_VER
+#include <crtdbg.h>
+#endif // _MSC_VER
#else
-# include <unistd.h>
+#include <unistd.h>
#endif // GTEST_OS_WINDOWS
#if GTEST_OS_MAC
-# include <mach/mach_init.h>
-# include <mach/task.h>
-# include <mach/vm_map.h>
+#include <mach/mach_init.h>
+#include <mach/task.h>
+#include <mach/vm_map.h>
#endif // GTEST_OS_MAC
#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
GTEST_OS_NETBSD || GTEST_OS_OPENBSD
-# include <sys/sysctl.h>
-# if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
-# include <sys/user.h>
-# endif
+#include <sys/sysctl.h>
+#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
+#include <sys/user.h>
+#endif
#endif
#if GTEST_OS_QNX
-# include <devctl.h>
-# include <fcntl.h>
-# include <sys/procfs.h>
+#include <devctl.h>
+#include <fcntl.h>
+#include <sys/procfs.h>
#endif // GTEST_OS_QNX
#if GTEST_OS_AIX
-# include <procinfo.h>
-# include <sys/types.h>
+#include <procinfo.h>
+#include <sys/types.h>
#endif // GTEST_OS_AIX
#if GTEST_OS_FUCHSIA
-# include <zircon/process.h>
-# include <zircon/syscalls.h>
+#include <zircon/process.h>
+#include <zircon/syscalls.h>
#endif // GTEST_OS_FUCHSIA
-#include "gtest/gtest-spi.h"
#include "gtest/gtest-message.h"
+#include "gtest/gtest-spi.h"
#include "gtest/internal/gtest-internal.h"
#include "gtest/internal/gtest-string.h"
#include "src/gtest-internal-inl.h"
@@ -89,16 +90,7 @@
namespace testing {
namespace internal {
-#if defined(_MSC_VER) || defined(__BORLANDC__)
-// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
-const int kStdOutFileno = 1;
-const int kStdErrFileno = 2;
-#else
-const int kStdOutFileno = STDOUT_FILENO;
-const int kStdErrFileno = STDERR_FILENO;
-#endif // _MSC_VER
-
-#if GTEST_OS_LINUX
+#if GTEST_OS_LINUX || GTEST_OS_GNU_HURD
namespace {
template <typename T>
@@ -131,8 +123,7 @@ size_t GetThreadCount() {
if (status == KERN_SUCCESS) {
// task_threads allocates resources in thread_list and we need to free them
// to avoid leaks.
- vm_deallocate(task,
- reinterpret_cast<vm_address_t>(thread_list),
+ vm_deallocate(task, reinterpret_cast<vm_address_t>(thread_list),
sizeof(thread_t) * thread_count);
return static_cast<size_t>(thread_count);
} else {
@@ -141,7 +132,7 @@ size_t GetThreadCount() {
}
#elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
- GTEST_OS_NETBSD
+ GTEST_OS_NETBSD
#if GTEST_OS_NETBSD
#undef KERN_PROC
@@ -184,12 +175,12 @@ size_t GetThreadCount() {
// we cannot detect it.
size_t GetThreadCount() {
int mib[] = {
- CTL_KERN,
- KERN_PROC,
- KERN_PROC_PID | KERN_PROC_SHOW_THREADS,
- getpid(),
- sizeof(struct kinfo_proc),
- 0,
+ CTL_KERN,
+ KERN_PROC,
+ KERN_PROC_PID | KERN_PROC_SHOW_THREADS,
+ getpid(),
+ sizeof(struct kinfo_proc),
+ 0,
};
u_int miblen = sizeof(mib) / sizeof(mib[0]);
@@ -210,8 +201,7 @@ size_t GetThreadCount() {
// exclude empty members
size_t nthreads = 0;
for (size_t i = 0; i < size / static_cast<size_t>(mib[4]); i++) {
- if (info[i].p_tid != -1)
- nthreads++;
+ if (info[i].p_tid != -1) nthreads++;
}
return nthreads;
}
@@ -254,13 +244,9 @@ size_t GetThreadCount() {
size_t GetThreadCount() {
int dummy_buffer;
size_t avail;
- zx_status_t status = zx_object_get_info(
- zx_process_self(),
- ZX_INFO_PROCESS_THREADS,
- &dummy_buffer,
- 0,
- nullptr,
- &avail);
+ zx_status_t status =
+ zx_object_get_info(zx_process_self(), ZX_INFO_PROCESS_THREADS,
+ &dummy_buffer, 0, nullptr, &avail);
if (status == ZX_OK) {
return avail;
} else {
@@ -280,27 +266,15 @@ size_t GetThreadCount() {
#if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
-void SleepMilliseconds(int n) {
- ::Sleep(static_cast<DWORD>(n));
-}
+AutoHandle::AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
-AutoHandle::AutoHandle()
- : handle_(INVALID_HANDLE_VALUE) {}
+AutoHandle::AutoHandle(Handle handle) : handle_(handle) {}
-AutoHandle::AutoHandle(Handle handle)
- : handle_(handle) {}
+AutoHandle::~AutoHandle() { Reset(); }
-AutoHandle::~AutoHandle() {
- Reset();
-}
-
-AutoHandle::Handle AutoHandle::Get() const {
- return handle_;
-}
+AutoHandle::Handle AutoHandle::Get() const { return handle_; }
-void AutoHandle::Reset() {
- Reset(INVALID_HANDLE_VALUE);
-}
+void AutoHandle::Reset() { Reset(INVALID_HANDLE_VALUE); }
void AutoHandle::Reset(HANDLE handle) {
// Resetting with the same handle we already own is invalid.
@@ -312,7 +286,7 @@ void AutoHandle::Reset(HANDLE handle) {
} else {
GTEST_CHECK_(!IsCloseable())
<< "Resetting a valid handle to itself is likely a programmer error "
- "and thus not allowed.";
+ "and thus not allowed.";
}
}
@@ -322,23 +296,6 @@ bool AutoHandle::IsCloseable() const {
return handle_ != nullptr && handle_ != INVALID_HANDLE_VALUE;
}
-Notification::Notification()
- : event_(::CreateEvent(nullptr, // Default security attributes.
- TRUE, // Do not reset automatically.
- FALSE, // Initially unset.
- nullptr)) { // Anonymous event.
- GTEST_CHECK_(event_.Get() != nullptr);
-}
-
-void Notification::Notify() {
- GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE);
-}
-
-void Notification::WaitForNotification() {
- GTEST_CHECK_(
- ::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0);
-}
-
Mutex::Mutex()
: owner_thread_id_(0),
type_(kDynamic),
@@ -391,25 +348,25 @@ namespace {
// MemoryIsNotDeallocated memory_is_not_deallocated;
// critical_section_ = new CRITICAL_SECTION;
//
-class MemoryIsNotDeallocated
-{
+class MemoryIsNotDeallocated {
public:
MemoryIsNotDeallocated() : old_crtdbg_flag_(0) {
old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG);
// Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT
// doesn't report mem leak if there's no matching deallocation.
- _CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF);
+ (void)_CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF);
}
~MemoryIsNotDeallocated() {
// Restore the original _CRTDBG_ALLOC_MEM_DF flag
- _CrtSetDbgFlag(old_crtdbg_flag_);
+ (void)_CrtSetDbgFlag(old_crtdbg_flag_);
}
private:
int old_crtdbg_flag_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(MemoryIsNotDeallocated);
+ MemoryIsNotDeallocated(const MemoryIsNotDeallocated&) = delete;
+ MemoryIsNotDeallocated& operator=(const MemoryIsNotDeallocated&) = delete;
};
#endif // _MSC_VER
@@ -435,15 +392,13 @@ void Mutex::ThreadSafeLazyInit() {
::InitializeCriticalSection(critical_section_);
// Updates the critical_section_init_phase_ to 2 to signal
// initialization complete.
- GTEST_CHECK_(::InterlockedCompareExchange(
- &critical_section_init_phase_, 2L, 1L) ==
- 1L);
+ GTEST_CHECK_(::InterlockedCompareExchange(&critical_section_init_phase_,
+ 2L, 1L) == 1L);
break;
case 1:
// Somebody else is already initializing the mutex; spin until they
// are done.
- while (::InterlockedCompareExchange(&critical_section_init_phase_,
- 2L,
+ while (::InterlockedCompareExchange(&critical_section_init_phase_, 2L,
2L) != 2L) {
// Possibly yields the rest of the thread's time slice to other
// threads.
@@ -488,9 +443,7 @@ class ThreadWithParamSupport : public ThreadWithParamBase {
private:
struct ThreadMainParam {
ThreadMainParam(Runnable* runnable, Notification* thread_can_start)
- : runnable_(runnable),
- thread_can_start_(thread_can_start) {
- }
+ : runnable_(runnable), thread_can_start_(thread_can_start) {}
std::unique_ptr<Runnable> runnable_;
// Does not own.
Notification* thread_can_start_;
@@ -508,20 +461,18 @@ class ThreadWithParamSupport : public ThreadWithParamBase {
// Prohibit instantiation.
ThreadWithParamSupport();
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParamSupport);
+ ThreadWithParamSupport(const ThreadWithParamSupport&) = delete;
+ ThreadWithParamSupport& operator=(const ThreadWithParamSupport&) = delete;
};
} // namespace
-ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable,
+ThreadWithParamBase::ThreadWithParamBase(Runnable* runnable,
Notification* thread_can_start)
- : thread_(ThreadWithParamSupport::CreateThread(runnable,
- thread_can_start)) {
-}
+ : thread_(
+ ThreadWithParamSupport::CreateThread(runnable, thread_can_start)) {}
-ThreadWithParamBase::~ThreadWithParamBase() {
- Join();
-}
+ThreadWithParamBase::~ThreadWithParamBase() { Join(); }
void ThreadWithParamBase::Join() {
GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0)
@@ -548,8 +499,10 @@ class ThreadLocalRegistryImpl {
ThreadIdToThreadLocals::iterator thread_local_pos =
thread_to_thread_locals->find(current_thread);
if (thread_local_pos == thread_to_thread_locals->end()) {
- thread_local_pos = thread_to_thread_locals->insert(
- std::make_pair(current_thread, ThreadLocalValues())).first;
+ thread_local_pos =
+ thread_to_thread_locals
+ ->insert(std::make_pair(current_thread, ThreadLocalValues()))
+ .first;
StartWatcherThreadFor(current_thread);
}
ThreadLocalValues& thread_local_values = thread_local_pos->second;
@@ -577,9 +530,8 @@ class ThreadLocalRegistryImpl {
ThreadIdToThreadLocals* const thread_to_thread_locals =
GetThreadLocalsMapLocked();
for (ThreadIdToThreadLocals::iterator it =
- thread_to_thread_locals->begin();
- it != thread_to_thread_locals->end();
- ++it) {
+ thread_to_thread_locals->begin();
+ it != thread_to_thread_locals->end(); ++it) {
ThreadLocalValues& thread_local_values = it->second;
ThreadLocalValues::iterator value_pos =
thread_local_values.find(thread_local_instance);
@@ -609,9 +561,8 @@ class ThreadLocalRegistryImpl {
if (thread_local_pos != thread_to_thread_locals->end()) {
ThreadLocalValues& thread_local_values = thread_local_pos->second;
for (ThreadLocalValues::iterator value_pos =
- thread_local_values.begin();
- value_pos != thread_local_values.end();
- ++value_pos) {
+ thread_local_values.begin();
+ value_pos != thread_local_values.end(); ++value_pos) {
value_holders.push_back(value_pos->second);
}
thread_to_thread_locals->erase(thread_local_pos);
@@ -637,9 +588,8 @@ class ThreadLocalRegistryImpl {
static void StartWatcherThreadFor(DWORD thread_id) {
// The returned handle will be kept in thread_map and closed by
// watcher_thread in WatcherThreadFunc.
- HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION,
- FALSE,
- thread_id);
+ HANDLE thread =
+ ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id);
GTEST_CHECK_(thread != nullptr);
// We need to pass a valid thread ID pointer into CreateThread for it
// to work correctly under Win98.
@@ -650,7 +600,8 @@ class ThreadLocalRegistryImpl {
&ThreadLocalRegistryImpl::WatcherThreadFunc,
reinterpret_cast<LPVOID>(new ThreadIdAndHandle(thread_id, thread)),
CREATE_SUSPENDED, &watcher_thread_id);
- GTEST_CHECK_(watcher_thread != nullptr);
+ GTEST_CHECK_(watcher_thread != nullptr)
+ << "CreateThread failed with error " << ::GetLastError() << ".";
// Give the watcher thread the same priority as ours to avoid being
// blocked by it.
::SetThreadPriority(watcher_thread,
@@ -664,8 +615,7 @@ class ThreadLocalRegistryImpl {
static DWORD WINAPI WatcherThreadFunc(LPVOID param) {
const ThreadIdAndHandle* tah =
reinterpret_cast<const ThreadIdAndHandle*>(param);
- GTEST_CHECK_(
- ::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
+ GTEST_CHECK_(::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
OnThreadExit(tah->first);
::CloseHandle(tah->second);
delete tah;
@@ -689,16 +639,17 @@ class ThreadLocalRegistryImpl {
};
Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex); // NOLINT
-Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex); // NOLINT
+Mutex ThreadLocalRegistryImpl::thread_map_mutex_(
+ Mutex::kStaticMutex); // NOLINT
ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread(
- const ThreadLocalBase* thread_local_instance) {
+ const ThreadLocalBase* thread_local_instance) {
return ThreadLocalRegistryImpl::GetValueOnCurrentThread(
thread_local_instance);
}
void ThreadLocalRegistry::OnThreadLocalDestroyed(
- const ThreadLocalBase* thread_local_instance) {
+ const ThreadLocalBase* thread_local_instance) {
ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance);
}
@@ -786,7 +737,7 @@ bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
bool IsAsciiWordChar(char ch) {
return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
- ('0' <= ch && ch <= '9') || ch == '_';
+ ('0' <= ch && ch <= '9') || ch == '_';
}
// Returns true if and only if "\\c" is a supported escape sequence.
@@ -799,17 +750,28 @@ bool IsValidEscape(char c) {
bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
if (escaped) { // "\\p" where p is pattern_char.
switch (pattern_char) {
- case 'd': return IsAsciiDigit(ch);
- case 'D': return !IsAsciiDigit(ch);
- case 'f': return ch == '\f';
- case 'n': return ch == '\n';
- case 'r': return ch == '\r';
- case 's': return IsAsciiWhiteSpace(ch);
- case 'S': return !IsAsciiWhiteSpace(ch);
- case 't': return ch == '\t';
- case 'v': return ch == '\v';
- case 'w': return IsAsciiWordChar(ch);
- case 'W': return !IsAsciiWordChar(ch);
+ case 'd':
+ return IsAsciiDigit(ch);
+ case 'D':
+ return !IsAsciiDigit(ch);
+ case 'f':
+ return ch == '\f';
+ case 'n':
+ return ch == '\n';
+ case 'r':
+ return ch == '\r';
+ case 's':
+ return IsAsciiWhiteSpace(ch);
+ case 'S':
+ return !IsAsciiWhiteSpace(ch);
+ case 't':
+ return ch == '\t';
+ case 'v':
+ return ch == '\v';
+ case 'w':
+ return IsAsciiWordChar(ch);
+ case 'W':
+ return !IsAsciiWordChar(ch);
}
return IsAsciiPunct(pattern_char) && pattern_char == ch;
}
@@ -820,7 +782,8 @@ bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
// Helper function used by ValidateRegex() to format error messages.
static std::string FormatRegexSyntaxError(const char* regex, int index) {
return (Message() << "Syntax error at index " << index
- << " in simple regular expression \"" << regex << "\": ").GetString();
+ << " in simple regular expression \"" << regex << "\": ")
+ .GetString();
}
// Generates non-fatal failures and returns false if regex is invalid;
@@ -862,12 +825,12 @@ bool ValidateRegex(const char* regex) {
<< "'$' can only appear at the end.";
is_valid = false;
} else if (IsInSet(ch, "()[]{}|")) {
- ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
- << "'" << ch << "' is unsupported.";
+ ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+ << "' is unsupported.";
is_valid = false;
} else if (IsRepeat(ch) && !prev_repeatable) {
- ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
- << "'" << ch << "' can only follow a repeatable token.";
+ ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+ << "' can only follow a repeatable token.";
is_valid = false;
}
@@ -885,12 +848,10 @@ bool ValidateRegex(const char* regex) {
// characters to be indexable by size_t, in which case the test will
// probably time out anyway. We are fine with this limitation as
// std::string has it too.
-bool MatchRepetitionAndRegexAtHead(
- bool escaped, char c, char repeat, const char* regex,
- const char* str) {
+bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat,
+ const char* regex, const char* str) {
const size_t min_count = (repeat == '+') ? 1 : 0;
- const size_t max_count = (repeat == '?') ? 1 :
- static_cast<size_t>(-1) - 1;
+ const size_t max_count = (repeat == '?') ? 1 : static_cast<size_t>(-1) - 1;
// We cannot call numeric_limits::max() as it conflicts with the
// max() macro on Windows.
@@ -903,8 +864,7 @@ bool MatchRepetitionAndRegexAtHead(
// greedy match.
return true;
}
- if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
- return false;
+ if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) return false;
}
return false;
}
@@ -918,25 +878,23 @@ bool MatchRegexAtHead(const char* regex, const char* str) {
// "$" only matches the end of a string. Note that regex being
// valid guarantees that there's nothing after "$" in it.
- if (*regex == '$')
- return *str == '\0';
+ if (*regex == '$') return *str == '\0';
// Is the first thing in regex an escape sequence?
const bool escaped = *regex == '\\';
- if (escaped)
- ++regex;
+ if (escaped) ++regex;
if (IsRepeat(regex[1])) {
// MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
// here's an indirect recursion. It terminates as the regex gets
// shorter in each recursion.
- return MatchRepetitionAndRegexAtHead(
- escaped, regex[0], regex[1], regex + 2, str);
+ return MatchRepetitionAndRegexAtHead(escaped, regex[0], regex[1], regex + 2,
+ str);
} else {
// regex isn't empty, isn't "$", and doesn't start with a
// repetition. We match the first atom of regex with the first
// character of str and recurse.
return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
- MatchRegexAtHead(regex + 1, str + 1);
+ MatchRegexAtHead(regex + 1, str + 1);
}
}
@@ -951,13 +909,11 @@ bool MatchRegexAtHead(const char* regex, const char* str) {
bool MatchRegexAnywhere(const char* regex, const char* str) {
if (regex == nullptr || str == nullptr) return false;
- if (*regex == '^')
- return MatchRegexAtHead(regex + 1, str);
+ if (*regex == '^') return MatchRegexAtHead(regex + 1, str);
// A successful match can be anywhere in str.
do {
- if (MatchRegexAtHead(regex, str))
- return true;
+ if (MatchRegexAtHead(regex, str)) return true;
} while (*str++ != '\0');
return false;
}
@@ -1038,8 +994,8 @@ GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
// FormatFileLocation in order to contrast the two functions.
// Note that FormatCompilerIndependentFileLocation() does NOT append colon
// to the file location it produces, unlike FormatFileLocation().
-GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
- const char* file, int line) {
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+ int line) {
const std::string file_name(file == nullptr ? kUnknownFile : file);
if (line < 0)
@@ -1050,12 +1006,13 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
: severity_(severity) {
- const char* const marker =
- severity == GTEST_INFO ? "[ INFO ]" :
- severity == GTEST_WARNING ? "[WARNING]" :
- severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]";
- GetStream() << ::std::endl << marker << " "
- << FormatFileLocation(file, line).c_str() << ": ";
+ const char* const marker = severity == GTEST_INFO ? "[ INFO ]"
+ : severity == GTEST_WARNING ? "[WARNING]"
+ : severity == GTEST_ERROR ? "[ ERROR ]"
+ : "[ FATAL ]";
+ GetStream() << ::std::endl
+ << marker << " " << FormatFileLocation(file, line).c_str()
+ << ": ";
}
// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
@@ -1078,27 +1035,26 @@ class CapturedStream {
public:
// The ctor redirects the stream to a temporary file.
explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
-# if GTEST_OS_WINDOWS
- char temp_dir_path[MAX_PATH + 1] = { '\0' }; // NOLINT
- char temp_file_path[MAX_PATH + 1] = { '\0' }; // NOLINT
+#if GTEST_OS_WINDOWS
+ char temp_dir_path[MAX_PATH + 1] = {'\0'}; // NOLINT
+ char temp_file_path[MAX_PATH + 1] = {'\0'}; // NOLINT
::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
- const UINT success = ::GetTempFileNameA(temp_dir_path,
- "gtest_redir",
+ const UINT success = ::GetTempFileNameA(temp_dir_path, "gtest_redir",
0, // Generate unique file name.
temp_file_path);
GTEST_CHECK_(success != 0)
<< "Unable to create a temporary file in " << temp_dir_path;
const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
- GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
- << temp_file_path;
+ GTEST_CHECK_(captured_fd != -1)
+ << "Unable to open temporary file " << temp_file_path;
filename_ = temp_file_path;
-# else
+#else
// There's no guarantee that a test has write access to the current
// directory, so we create the temporary file in a temporary directory.
std::string name_template;
-# if GTEST_OS_LINUX_ANDROID
+#if GTEST_OS_LINUX_ANDROID
// Note: Android applications are expected to call the framework's
// Context.getExternalStorageDirectory() method through JNI to get
// the location of the world-writable SD Card directory. However,
@@ -1111,7 +1067,7 @@ class CapturedStream {
// '/sdcard' and other variants cannot be relied on, as they are not
// guaranteed to be mounted, or may have a delay in mounting.
name_template = "/data/local/tmp/";
-# elif GTEST_OS_IOS
+#elif GTEST_OS_IOS
char user_temp_dir[PATH_MAX + 1];
// Documented alternative to NSTemporaryDirectory() (for obtaining creating
@@ -1132,9 +1088,9 @@ class CapturedStream {
name_template = user_temp_dir;
if (name_template.back() != GTEST_PATH_SEP_[0])
name_template.push_back(GTEST_PATH_SEP_[0]);
-# else
+#else
name_template = "/tmp/";
-# endif
+#endif
name_template.append("gtest_captured_stream.XXXXXX");
// mkstemp() modifies the string bytes in place, and does not go beyond the
@@ -1150,15 +1106,13 @@ class CapturedStream {
<< " for test; does the test have access to the /tmp directory?";
}
filename_ = std::move(name_template);
-# endif // GTEST_OS_WINDOWS
+#endif // GTEST_OS_WINDOWS
fflush(nullptr);
dup2(captured_fd, fd_);
close(captured_fd);
}
- ~CapturedStream() {
- remove(filename_.c_str());
- }
+ ~CapturedStream() { remove(filename_.c_str()); }
std::string GetCapturedString() {
if (uncaptured_fd_ != -1) {
@@ -1185,7 +1139,8 @@ class CapturedStream {
// Name of the temporary file holding the stderr output.
::std::string filename_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
+ CapturedStream(const CapturedStream&) = delete;
+ CapturedStream& operator=(const CapturedStream&) = delete;
};
GTEST_DISABLE_MSC_DEPRECATED_POP_()
@@ -1213,6 +1168,15 @@ static std::string GetCapturedStream(CapturedStream** captured_stream) {
return content;
}
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif // defined(_MSC_VER) || defined(__BORLANDC__)
+
// Starts capturing stdout.
void CaptureStdout() {
CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
@@ -1235,10 +1199,6 @@ std::string GetCapturedStderr() {
#endif // GTEST_HAS_STREAM_REDIRECTION
-
-
-
-
size_t GetFileSize(FILE* file) {
fseek(file, 0, SEEK_END);
return static_cast<size_t>(ftell(file));
@@ -1256,7 +1216,8 @@ std::string ReadEntireFile(FILE* file) {
// Keeps reading the file until we cannot read further or the
// pre-determined file size is reached.
do {
- bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file);
+ bytes_last_read =
+ fread(buffer + bytes_read, 1, file_size - bytes_read, file);
bytes_read += bytes_last_read;
} while (bytes_last_read > 0 && bytes_read < file_size);
@@ -1344,7 +1305,7 @@ bool ParseInt32(const Message& src_text, const char* str, int32_t* value) {
// LONG_MAX or LONG_MIN when the input overflows.)
result != long_value
// The parsed value overflows as an int32_t.
- ) {
+ ) {
Message msg;
msg << "WARNING: " << src_text
<< " is expected to be a 32-bit integer, but actually"
@@ -1388,8 +1349,8 @@ int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) {
}
int32_t result = default_value;
- if (!ParseInt32(Message() << "Environment variable " << env_var,
- string_value, &result)) {
+ if (!ParseInt32(Message() << "Environment variable " << env_var, string_value,
+ &result)) {
printf("The default value %s is used.\n",
(Message() << default_value).GetString().c_str());
fflush(stdout);
@@ -1408,7 +1369,7 @@ int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) {
// not check that the flag is 'output'
// In essence this checks an env variable called XML_OUTPUT_FILE
// and if it is set we prepend "xml:" to its value, if it not set we return ""
-std::string OutputFlagAlsoCheckEnvVar(){
+std::string OutputFlagAlsoCheckEnvVar() {
std::string default_value_for_output_flag = "";
const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE");
if (nullptr != xml_output_file_env) {
diff --git a/third_party/googletest/src/src/gtest-printers.cc b/third_party/googletest/src/src/gtest-printers.cc
index 1b68fcb50..f3976d230 100644
--- a/third_party/googletest/src/src/gtest-printers.cc
+++ b/third_party/googletest/src/src/gtest-printers.cc
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Test - The Google C++ Testing and Mocking Framework
//
// This file implements a universal value printer that can print a
@@ -101,7 +100,7 @@ void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
*os << " ... ";
// Rounds up to 2-byte boundary.
- const size_t resume_pos = (count - kChunkSize + 1)/2*2;
+ const size_t resume_pos = (count - kChunkSize + 1) / 2 * 2;
PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os);
}
*os << ">";
@@ -136,11 +135,7 @@ void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
// - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
// - as a hexadecimal escape sequence (e.g. '\x7F'), or
// - as a special escape sequence (e.g. '\r', '\n').
-enum CharFormat {
- kAsIs,
- kHexEscape,
- kSpecialEscape
-};
+enum CharFormat { kAsIs, kHexEscape, kSpecialEscape };
// Returns true if c is a printable ASCII character. We test the
// value of c directly instead of calling isprint(), which is buggy on
@@ -213,35 +208,21 @@ static CharFormat PrintAsStringLiteralTo(char32_t c, ostream* os) {
}
}
-static const char* GetCharWidthPrefix(char) {
- return "";
-}
+static const char* GetCharWidthPrefix(char) { return ""; }
-static const char* GetCharWidthPrefix(signed char) {
- return "";
-}
+static const char* GetCharWidthPrefix(signed char) { return ""; }
-static const char* GetCharWidthPrefix(unsigned char) {
- return "";
-}
+static const char* GetCharWidthPrefix(unsigned char) { return ""; }
#ifdef __cpp_char8_t
-static const char* GetCharWidthPrefix(char8_t) {
- return "u8";
-}
+static const char* GetCharWidthPrefix(char8_t) { return "u8"; }
#endif
-static const char* GetCharWidthPrefix(char16_t) {
- return "u";
-}
+static const char* GetCharWidthPrefix(char16_t) { return "u"; }
-static const char* GetCharWidthPrefix(char32_t) {
- return "U";
-}
+static const char* GetCharWidthPrefix(char32_t) { return "U"; }
-static const char* GetCharWidthPrefix(wchar_t) {
- return "L";
-}
+static const char* GetCharWidthPrefix(wchar_t) { return "L"; }
// Prints a char c as if it's part of a string literal, escaping it when
// necessary; returns how c was formatted.
@@ -276,8 +257,7 @@ void PrintCharAndCodeTo(Char c, ostream* os) {
// To aid user debugging, we also print c's code in decimal, unless
// it's 0 (in which case c was printed as '\\0', making the code
// obvious).
- if (c == 0)
- return;
+ if (c == 0) return;
*os << " (" << static_cast<int>(c);
// For more convenience, we print c's code again in hexadecimal,
@@ -304,17 +284,60 @@ void PrintTo(char32_t c, ::std::ostream* os) {
<< static_cast<uint32_t>(c);
}
+// gcc/clang __{u,}int128_t
+#if defined(__SIZEOF_INT128__)
+void PrintTo(__uint128_t v, ::std::ostream* os) {
+ if (v == 0) {
+ *os << "0";
+ return;
+ }
+
+ // Buffer large enough for ceil(log10(2^128))==39 and the null terminator
+ char buf[40];
+ char* p = buf + sizeof(buf);
+
+ // Some configurations have a __uint128_t, but no support for built in
+ // division. Do manual long division instead.
+
+ uint64_t high = static_cast<uint64_t>(v >> 64);
+ uint64_t low = static_cast<uint64_t>(v);
+
+ *--p = 0;
+ while (high != 0 || low != 0) {
+ uint64_t high_mod = high % 10;
+ high = high / 10;
+ // This is the long division algorithm specialized for a divisor of 10 and
+ // only two elements.
+ // Notable values:
+ // 2^64 / 10 == 1844674407370955161
+ // 2^64 % 10 == 6
+ const uint64_t carry = 6 * high_mod + low % 10;
+ low = low / 10 + high_mod * 1844674407370955161 + carry / 10;
+
+ char digit = static_cast<char>(carry % 10);
+ *--p = '0' + digit;
+ }
+ *os << p;
+}
+void PrintTo(__int128_t v, ::std::ostream* os) {
+ __uint128_t uv = static_cast<__uint128_t>(v);
+ if (v < 0) {
+ *os << "-";
+ uv = -uv;
+ }
+ PrintTo(uv, os);
+}
+#endif // __SIZEOF_INT128__
+
// Prints the given array of characters to the ostream. CharType must be either
// char, char8_t, char16_t, char32_t, or wchar_t.
// The array starts at begin, the length is len, it may include '\0' characters
// and may not be NUL-terminated.
template <typename CharType>
-GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-static CharFormat PrintCharsAsStringTo(
- const CharType* begin, size_t len, ostream* os) {
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat
+ PrintCharsAsStringTo(const CharType* begin, size_t len, ostream* os) {
const char* const quote_prefix = GetCharWidthPrefix(*begin);
*os << quote_prefix << "\"";
bool is_previous_hex = false;
@@ -340,12 +363,11 @@ static CharFormat PrintCharsAsStringTo(
// Prints a (const) char/wchar_t array of 'len' elements, starting at address
// 'begin'. CharType must be either char or wchar_t.
template <typename CharType>
-GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-static void UniversalPrintCharArray(
- const CharType* begin, size_t len, ostream* os) {
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void
+ UniversalPrintCharArray(const CharType* begin, size_t len,
+ ostream* os) {
// The code
// const char kFoo[] = "foo";
// generates an array of 4, not 3, elements, with the last one being '\0'.
@@ -436,28 +458,28 @@ void PrintTo(const wchar_t* s, ostream* os) { PrintCStringTo(s, os); }
namespace {
bool ContainsUnprintableControlCodes(const char* str, size_t length) {
- const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
+ const unsigned char* s = reinterpret_cast<const unsigned char*>(str);
for (size_t i = 0; i < length; i++) {
unsigned char ch = *s++;
if (std::iscntrl(ch)) {
- switch (ch) {
+ switch (ch) {
case '\t':
case '\n':
case '\r':
break;
default:
return true;
- }
}
+ }
}
return false;
}
-bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t<= 0xbf; }
+bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t <= 0xbf; }
bool IsValidUTF8(const char* str, size_t length) {
- const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
+ const unsigned char* s = reinterpret_cast<const unsigned char*>(str);
for (size_t i = 0; i < length;) {
unsigned char lead = s[i++];
@@ -470,15 +492,13 @@ bool IsValidUTF8(const char* str, size_t length) {
} else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) {
++i; // 2-byte character
} else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length &&
- IsUTF8TrailByte(s[i]) &&
- IsUTF8TrailByte(s[i + 1]) &&
+ IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
// check for non-shortest form and surrogate
(lead != 0xe0 || s[i] >= 0xa0) &&
(lead != 0xed || s[i] < 0xa0)) {
i += 2; // 3-byte character
} else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length &&
- IsUTF8TrailByte(s[i]) &&
- IsUTF8TrailByte(s[i + 1]) &&
+ IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
IsUTF8TrailByte(s[i + 2]) &&
// check for non-shortest form
(lead != 0xf0 || s[i] >= 0x90) &&
@@ -502,7 +522,7 @@ void ConditionalPrintAsText(const char* str, size_t length, ostream* os) {
void PrintStringTo(const ::std::string& s, ostream* os) {
if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) {
- if (GTEST_FLAG(print_utf8)) {
+ if (GTEST_FLAG_GET(print_utf8)) {
ConditionalPrintAsText(s.data(), s.size(), os);
}
}
diff --git a/third_party/googletest/src/src/gtest-test-part.cc b/third_party/googletest/src/src/gtest-test-part.cc
index a938683ce..eb7c8d1cf 100644
--- a/third_party/googletest/src/src/gtest-test-part.cc
+++ b/third_party/googletest/src/src/gtest-test-part.cc
@@ -51,13 +51,11 @@ std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
return os << internal::FormatFileLocation(result.file_name(),
result.line_number())
<< " "
- << (result.type() == TestPartResult::kSuccess
- ? "Success"
- : result.type() == TestPartResult::kSkip
- ? "Skipped"
- : result.type() == TestPartResult::kFatalFailure
- ? "Fatal failure"
- : "Non-fatal failure")
+ << (result.type() == TestPartResult::kSuccess ? "Success"
+ : result.type() == TestPartResult::kSkip ? "Skipped"
+ : result.type() == TestPartResult::kFatalFailure
+ ? "Fatal failure"
+ : "Non-fatal failure")
<< ":\n"
<< result.message() << std::endl;
}
@@ -86,8 +84,8 @@ namespace internal {
HasNewFatalFailureHelper::HasNewFatalFailureHelper()
: has_new_fatal_failure_(false),
- original_reporter_(GetUnitTestImpl()->
- GetTestPartResultReporterForCurrentThread()) {
+ original_reporter_(
+ GetUnitTestImpl()->GetTestPartResultReporterForCurrentThread()) {
GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
}
@@ -98,8 +96,7 @@ HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
void HasNewFatalFailureHelper::ReportTestPartResult(
const TestPartResult& result) {
- if (result.fatally_failed())
- has_new_fatal_failure_ = true;
+ if (result.fatally_failed()) has_new_fatal_failure_ = true;
original_reporter_->ReportTestPartResult(result);
}
diff --git a/third_party/googletest/src/src/gtest-typed-test.cc b/third_party/googletest/src/src/gtest-typed-test.cc
index c02c3df65..a2828b83c 100644
--- a/third_party/googletest/src/src/gtest-typed-test.cc
+++ b/third_party/googletest/src/src/gtest-typed-test.cc
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
#include "gtest/gtest-typed-test.h"
#include "gtest/gtest.h"
@@ -38,8 +37,7 @@ namespace internal {
// Skips to the first non-space char in str. Returns an empty string if str
// contains only whitespace characters.
static const char* SkipSpaces(const char* str) {
- while (IsSpace(*str))
- str++;
+ while (IsSpace(*str)) str++;
return str;
}
@@ -85,8 +83,7 @@ const char* TypedTestSuitePState::VerifyRegisteredTestNames(
}
for (RegisteredTestIter it = registered_tests_.begin();
- it != registered_tests_.end();
- ++it) {
+ it != registered_tests_.end(); ++it) {
if (tests.count(it->first) == 0) {
errors << "You forgot to list test " << it->first << ".\n";
}
diff --git a/third_party/googletest/src/src/gtest.cc b/third_party/googletest/src/src/gtest.cc
index 21c611aff..6f31dd226 100644
--- a/third_party/googletest/src/src/gtest.cc
+++ b/third_party/googletest/src/src/gtest.cc
@@ -31,8 +31,6 @@
// The Google C++ Testing and Mocking Framework (Google Test)
#include "gtest/gtest.h"
-#include "gtest/internal/custom/gtest.h"
-#include "gtest/gtest-spi.h"
#include <ctype.h>
#include <stdarg.h>
@@ -46,79 +44,87 @@
#include <chrono> // NOLINT
#include <cmath>
#include <cstdint>
+#include <initializer_list>
#include <iomanip>
+#include <iterator>
#include <limits>
#include <list>
#include <map>
#include <ostream> // NOLINT
#include <sstream>
+#include <unordered_set>
#include <vector>
+#include "gtest/gtest-assertion-result.h"
+#include "gtest/gtest-spi.h"
+#include "gtest/internal/custom/gtest.h"
+
#if GTEST_OS_LINUX
-# include <fcntl.h> // NOLINT
-# include <limits.h> // NOLINT
-# include <sched.h> // NOLINT
+#include <fcntl.h> // NOLINT
+#include <limits.h> // NOLINT
+#include <sched.h> // NOLINT
// Declares vsnprintf(). This header is not available on Windows.
-# include <strings.h> // NOLINT
-# include <sys/mman.h> // NOLINT
-# include <sys/time.h> // NOLINT
-# include <unistd.h> // NOLINT
-# include <string>
+#include <strings.h> // NOLINT
+#include <sys/mman.h> // NOLINT
+#include <sys/time.h> // NOLINT
+#include <unistd.h> // NOLINT
+
+#include <string>
#elif GTEST_OS_ZOS
-# include <sys/time.h> // NOLINT
+#include <sys/time.h> // NOLINT
// On z/OS we additionally need strings.h for strcasecmp.
-# include <strings.h> // NOLINT
+#include <strings.h> // NOLINT
#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE.
-# include <windows.h> // NOLINT
-# undef min
+#include <windows.h> // NOLINT
+#undef min
#elif GTEST_OS_WINDOWS // We are on Windows proper.
-# include <windows.h> // NOLINT
-# undef min
+#include <windows.h> // NOLINT
+#undef min
#ifdef _MSC_VER
-# include <crtdbg.h> // NOLINT
+#include <crtdbg.h> // NOLINT
#endif
-# include <io.h> // NOLINT
-# include <sys/timeb.h> // NOLINT
-# include <sys/types.h> // NOLINT
-# include <sys/stat.h> // NOLINT
+#include <io.h> // NOLINT
+#include <sys/stat.h> // NOLINT
+#include <sys/timeb.h> // NOLINT
+#include <sys/types.h> // NOLINT
-# if GTEST_OS_WINDOWS_MINGW
-# include <sys/time.h> // NOLINT
-# endif // GTEST_OS_WINDOWS_MINGW
+#if GTEST_OS_WINDOWS_MINGW
+#include <sys/time.h> // NOLINT
+#endif // GTEST_OS_WINDOWS_MINGW
#else
// cpplint thinks that the header is already included, so we want to
// silence it.
-# include <sys/time.h> // NOLINT
-# include <unistd.h> // NOLINT
+#include <sys/time.h> // NOLINT
+#include <unistd.h> // NOLINT
#endif // GTEST_OS_LINUX
#if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
+#include <stdexcept>
#endif
#if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h> // NOLINT
-# include <netdb.h> // NOLINT
-# include <sys/socket.h> // NOLINT
-# include <sys/types.h> // NOLINT
+#include <arpa/inet.h> // NOLINT
+#include <netdb.h> // NOLINT
+#include <sys/socket.h> // NOLINT
+#include <sys/types.h> // NOLINT
#endif
#include "src/gtest-internal-inl.h"
#if GTEST_OS_WINDOWS
-# define vsnprintf _vsnprintf
+#define vsnprintf _vsnprintf
#endif // GTEST_OS_WINDOWS
#if GTEST_OS_MAC
@@ -131,7 +137,10 @@
#include "absl/debugging/failure_signal_handler.h"
#include "absl/debugging/stacktrace.h"
#include "absl/debugging/symbolize.h"
+#include "absl/flags/parse.h"
+#include "absl/flags/usage.h"
#include "absl/strings/str_cat.h"
+#include "absl/strings/str_replace.h"
#endif // GTEST_HAS_ABSL
namespace testing {
@@ -177,7 +186,7 @@ const char kStackTraceMarker[] = "\nStack trace:\n";
// is specified on the command line.
bool g_help_flag = false;
-// Utilty function to Open File for Writing
+// Utility function to Open File for Writing
static FILE* OpenFileForWriting(const std::string& output_file) {
FILE* fileout = nullptr;
FilePath output_file_path(output_file);
@@ -216,28 +225,33 @@ static bool GetDefaultFailFast() {
return false;
}
+} // namespace testing
+
GTEST_DEFINE_bool_(
- fail_fast, internal::BoolFromGTestEnv("fail_fast", GetDefaultFailFast()),
+ fail_fast,
+ testing::internal::BoolFromGTestEnv("fail_fast",
+ testing::GetDefaultFailFast()),
"True if and only if a test failure should stop further test execution.");
GTEST_DEFINE_bool_(
also_run_disabled_tests,
- internal::BoolFromGTestEnv("also_run_disabled_tests", false),
+ testing::internal::BoolFromGTestEnv("also_run_disabled_tests", false),
"Run disabled tests too, in addition to the tests normally being run.");
GTEST_DEFINE_bool_(
- break_on_failure, internal::BoolFromGTestEnv("break_on_failure", false),
+ break_on_failure,
+ testing::internal::BoolFromGTestEnv("break_on_failure", false),
"True if and only if a failed assertion should be a debugger "
"break-point.");
GTEST_DEFINE_bool_(catch_exceptions,
- internal::BoolFromGTestEnv("catch_exceptions", true),
+ testing::internal::BoolFromGTestEnv("catch_exceptions",
+ true),
"True if and only if " GTEST_NAME_
" should catch exceptions and treat them as test failures.");
GTEST_DEFINE_string_(
- color,
- internal::StringFromGTestEnv("color", "auto"),
+ color, testing::internal::StringFromGTestEnv("color", "auto"),
"Whether to use colors in the output. Valid values: yes, no, "
"and auto. 'auto' means to use colors if the output is "
"being sent to a terminal and the TERM environment variable "
@@ -245,7 +259,8 @@ GTEST_DEFINE_string_(
GTEST_DEFINE_string_(
filter,
- internal::StringFromGTestEnv("filter", GetDefaultFilter()),
+ testing::internal::StringFromGTestEnv("filter",
+ testing::GetDefaultFilter()),
"A colon-separated list of glob (not regex) patterns "
"for filtering the tests to run, optionally followed by a "
"'-' and a : separated list of negative patterns (tests to "
@@ -254,13 +269,14 @@ GTEST_DEFINE_string_(
GTEST_DEFINE_bool_(
install_failure_signal_handler,
- internal::BoolFromGTestEnv("install_failure_signal_handler", false),
- "If true and supported on the current platform, " GTEST_NAME_ " should "
+ testing::internal::BoolFromGTestEnv("install_failure_signal_handler",
+ false),
+ "If true and supported on the current platform, " GTEST_NAME_
+ " should "
"install a signal handler that dumps debugging information when fatal "
"signals are raised.");
-GTEST_DEFINE_bool_(list_tests, false,
- "List all tests without running them.");
+GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them.");
// The net priority order after flag processing is thus:
// --gtest_output command line flag
@@ -269,8 +285,8 @@ GTEST_DEFINE_bool_(list_tests, false,
// ''
GTEST_DEFINE_string_(
output,
- internal::StringFromGTestEnv("output",
- internal::OutputFlagAlsoCheckEnvVar().c_str()),
+ testing::internal::StringFromGTestEnv(
+ "output", testing::internal::OutputFlagAlsoCheckEnvVar().c_str()),
"A format (defaults to \"xml\" but can be specified to be \"json\"), "
"optionally followed by a colon and an output file name or directory. "
"A directory is indicated by a trailing pathname separator. "
@@ -281,65 +297,79 @@ GTEST_DEFINE_string_(
"digits.");
GTEST_DEFINE_bool_(
- brief, internal::BoolFromGTestEnv("brief", false),
+ brief, testing::internal::BoolFromGTestEnv("brief", false),
"True if only test failures should be displayed in text output.");
-GTEST_DEFINE_bool_(print_time, internal::BoolFromGTestEnv("print_time", true),
+GTEST_DEFINE_bool_(print_time,
+ testing::internal::BoolFromGTestEnv("print_time", true),
"True if and only if " GTEST_NAME_
" should display elapsed time in text output.");
-GTEST_DEFINE_bool_(print_utf8, internal::BoolFromGTestEnv("print_utf8", true),
+GTEST_DEFINE_bool_(print_utf8,
+ testing::internal::BoolFromGTestEnv("print_utf8", true),
"True if and only if " GTEST_NAME_
" prints UTF8 characters as text.");
GTEST_DEFINE_int32_(
- random_seed,
- internal::Int32FromGTestEnv("random_seed", 0),
+ random_seed, testing::internal::Int32FromGTestEnv("random_seed", 0),
"Random number seed to use when shuffling test orders. Must be in range "
"[1, 99999], or 0 to use a seed based on the current time.");
GTEST_DEFINE_int32_(
- repeat,
- internal::Int32FromGTestEnv("repeat", 1),
+ repeat, testing::internal::Int32FromGTestEnv("repeat", 1),
"How many times to repeat each test. Specify a negative number "
"for repeating forever. Useful for shaking out flaky tests.");
+GTEST_DEFINE_bool_(
+ recreate_environments_when_repeating,
+ testing::internal::BoolFromGTestEnv("recreate_environments_when_repeating",
+ false),
+ "Controls whether global test environments are recreated for each repeat "
+ "of the tests. If set to false the global test environments are only set "
+ "up once, for the first iteration, and only torn down once, for the last. "
+ "Useful for shaking out flaky tests with stable, expensive test "
+ "environments. If --gtest_repeat is set to a negative number, meaning "
+ "there is no last run, the environments will always be recreated to avoid "
+ "leaks.");
+
GTEST_DEFINE_bool_(show_internal_stack_frames, false,
"True if and only if " GTEST_NAME_
" should include internal stack frames when "
"printing test failure stack traces.");
-GTEST_DEFINE_bool_(shuffle, internal::BoolFromGTestEnv("shuffle", false),
+GTEST_DEFINE_bool_(shuffle,
+ testing::internal::BoolFromGTestEnv("shuffle", false),
"True if and only if " GTEST_NAME_
" should randomize tests' order on every run.");
GTEST_DEFINE_int32_(
stack_trace_depth,
- internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth),
+ testing::internal::Int32FromGTestEnv("stack_trace_depth",
+ testing::kMaxStackTraceDepth),
"The maximum number of stack frames to print when an "
"assertion fails. The valid range is 0 through 100, inclusive.");
GTEST_DEFINE_string_(
stream_result_to,
- internal::StringFromGTestEnv("stream_result_to", ""),
+ testing::internal::StringFromGTestEnv("stream_result_to", ""),
"This flag specifies the host name and the port number on which to stream "
"test results. Example: \"localhost:555\". The flag is effective only on "
"Linux.");
GTEST_DEFINE_bool_(
throw_on_failure,
- internal::BoolFromGTestEnv("throw_on_failure", false),
+ testing::internal::BoolFromGTestEnv("throw_on_failure", false),
"When this flag is specified, a failed assertion will throw an exception "
"if exceptions are enabled or exit the program with a non-zero code "
"otherwise. For use with an external test framework.");
#if GTEST_USE_OWN_FLAGFILE_FLAG_
GTEST_DEFINE_string_(
- flagfile,
- internal::StringFromGTestEnv("flagfile", ""),
+ flagfile, testing::internal::StringFromGTestEnv("flagfile", ""),
"This flag specifies the flagfile to read command-line flags from.");
#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
+namespace testing {
namespace internal {
// Generates a random number from [0, range), using a Linear
@@ -348,10 +378,9 @@ namespace internal {
uint32_t Random::Generate(uint32_t range) {
// These constants are the same as are used in glibc's rand(3).
// Use wider types than necessary to prevent unsigned overflow diagnostics.
- state_ = static_cast<uint32_t>(1103515245ULL*state_ + 12345U) % kMaxRange;
+ state_ = static_cast<uint32_t>(1103515245ULL * state_ + 12345U) % kMaxRange;
- GTEST_CHECK_(range > 0)
- << "Cannot generate a number in the range [0, 0).";
+ GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0).";
GTEST_CHECK_(range <= kMaxRange)
<< "Generation of a number in [0, " << range << ") was requested, "
<< "but this can only generate numbers in [0, " << kMaxRange << ").";
@@ -396,32 +425,26 @@ static bool ShouldRunTestSuite(const TestSuite* test_suite) {
}
// AssertHelper constructor.
-AssertHelper::AssertHelper(TestPartResult::Type type,
- const char* file,
- int line,
- const char* message)
- : data_(new AssertHelperData(type, file, line, message)) {
-}
+AssertHelper::AssertHelper(TestPartResult::Type type, const char* file,
+ int line, const char* message)
+ : data_(new AssertHelperData(type, file, line, message)) {}
-AssertHelper::~AssertHelper() {
- delete data_;
-}
+AssertHelper::~AssertHelper() { delete data_; }
// Message assignment, for assertion streaming support.
void AssertHelper::operator=(const Message& message) const {
- UnitTest::GetInstance()->
- AddTestPartResult(data_->type, data_->file, data_->line,
- AppendUserMessage(data_->message, message),
- UnitTest::GetInstance()->impl()
- ->CurrentOsStackTraceExceptTop(1)
- // Skips the stack frame for this function itself.
- ); // NOLINT
+ UnitTest::GetInstance()->AddTestPartResult(
+ data_->type, data_->file, data_->line,
+ AppendUserMessage(data_->message, message),
+ UnitTest::GetInstance()->impl()->CurrentOsStackTraceExceptTop(1)
+ // Skips the stack frame for this function itself.
+ ); // NOLINT
}
namespace {
// When TEST_P is found without a matching INSTANTIATE_TEST_SUITE_P
-// to creates test cases for it, a syntetic test case is
+// to creates test cases for it, a synthetic test case is
// inserted to report ether an error or a log message.
//
// This configuration bit will likely be removed at some point.
@@ -452,7 +475,6 @@ class FailureTest : public Test {
const bool as_error_;
};
-
} // namespace
std::set<std::string>* GetIgnoredParameterizedTestSuites() {
@@ -496,7 +518,8 @@ void InsertSyntheticTestCase(const std::string& name, CodeLocation location,
"To suppress this error for this test suite, insert the following line "
"(in a non-header) in the namespace it is defined in:"
"\n\n"
- "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + name + ");";
+ "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" +
+ name + ");";
std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">";
RegisterTest( //
@@ -516,19 +539,18 @@ void RegisterTypeParameterizedTestSuite(const char* test_suite_name,
}
void RegisterTypeParameterizedTestSuiteInstantiation(const char* case_name) {
- GetUnitTestImpl()
- ->type_parameterized_test_registry()
- .RegisterInstantiation(case_name);
+ GetUnitTestImpl()->type_parameterized_test_registry().RegisterInstantiation(
+ case_name);
}
void TypeParameterizedTestSuiteRegistry::RegisterTestSuite(
const char* test_suite_name, CodeLocation code_location) {
suites_.emplace(std::string(test_suite_name),
- TypeParameterizedTestSuiteInfo(code_location));
+ TypeParameterizedTestSuiteInfo(code_location));
}
void TypeParameterizedTestSuiteRegistry::RegisterInstantiation(
- const char* test_suite_name) {
+ const char* test_suite_name) {
auto it = suites_.find(std::string(test_suite_name));
if (it != suites_.end()) {
it->second.instantiated = true;
@@ -606,7 +628,8 @@ FilePath GetCurrentExecutableName() {
// Returns the output format, or "" for normal printed output.
std::string UnitTestOptions::GetOutputFormat() {
- const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+ std::string s = GTEST_FLAG_GET(output);
+ const char* const gtest_output_flag = s.c_str();
const char* const colon = strchr(gtest_output_flag, ':');
return (colon == nullptr)
? std::string(gtest_output_flag)
@@ -617,19 +640,19 @@ std::string UnitTestOptions::GetOutputFormat() {
// Returns the name of the requested output file, or the default if none
// was explicitly specified.
std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
- const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+ std::string s = GTEST_FLAG_GET(output);
+ const char* const gtest_output_flag = s.c_str();
std::string format = GetOutputFormat();
- if (format.empty())
- format = std::string(kDefaultOutputFormat);
+ if (format.empty()) format = std::string(kDefaultOutputFormat);
const char* const colon = strchr(gtest_output_flag, ':');
if (colon == nullptr)
return internal::FilePath::MakeFileName(
- internal::FilePath(
- UnitTest::GetInstance()->original_working_dir()),
- internal::FilePath(kDefaultOutputFile), 0,
- format.c_str()).string();
+ internal::FilePath(
+ UnitTest::GetInstance()->original_working_dir()),
+ internal::FilePath(kDefaultOutputFile), 0, format.c_str())
+ .string();
internal::FilePath output_name(colon + 1);
if (!output_name.IsAbsolutePath())
@@ -637,8 +660,7 @@ std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
internal::FilePath(colon + 1));
- if (!output_name.IsDirectory())
- return output_name.string();
+ if (!output_name.IsDirectory()) return output_name.string();
internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
output_name, internal::GetCurrentExecutableName(),
@@ -699,59 +721,119 @@ static bool PatternMatchesString(const std::string& name_str,
return true;
}
-bool UnitTestOptions::MatchesFilter(const std::string& name_str,
- const char* filter) {
- // The filter is a list of patterns separated by colons (:).
- const char* pattern = filter;
- while (true) {
- // Find the bounds of this pattern.
- const char* const next_sep = strchr(pattern, ':');
- const char* const pattern_end =
- next_sep != nullptr ? next_sep : pattern + strlen(pattern);
-
- // Check if this pattern matches name_str.
- if (PatternMatchesString(name_str, pattern, pattern_end)) {
- return true;
- }
+namespace {
+
+bool IsGlobPattern(const std::string& pattern) {
+ return std::any_of(pattern.begin(), pattern.end(),
+ [](const char c) { return c == '?' || c == '*'; });
+}
+
+class UnitTestFilter {
+ public:
+ UnitTestFilter() = default;
+
+ // Constructs a filter from a string of patterns separated by `:`.
+ explicit UnitTestFilter(const std::string& filter) {
+ // By design "" filter matches "" string.
+ std::vector<std::string> all_patterns;
+ SplitString(filter, ':', &all_patterns);
+ const auto exact_match_patterns_begin = std::partition(
+ all_patterns.begin(), all_patterns.end(), &IsGlobPattern);
+
+ glob_patterns_.reserve(static_cast<size_t>(
+ std::distance(all_patterns.begin(), exact_match_patterns_begin)));
+ std::move(all_patterns.begin(), exact_match_patterns_begin,
+ std::inserter(glob_patterns_, glob_patterns_.begin()));
+ std::move(
+ exact_match_patterns_begin, all_patterns.end(),
+ std::inserter(exact_match_patterns_, exact_match_patterns_.begin()));
+ }
+
+ // Returns true if and only if name matches at least one of the patterns in
+ // the filter.
+ bool MatchesName(const std::string& name) const {
+ return exact_match_patterns_.count(name) > 0 ||
+ std::any_of(glob_patterns_.begin(), glob_patterns_.end(),
+ [&name](const std::string& pattern) {
+ return PatternMatchesString(
+ name, pattern.c_str(),
+ pattern.c_str() + pattern.size());
+ });
+ }
+
+ private:
+ std::vector<std::string> glob_patterns_;
+ std::unordered_set<std::string> exact_match_patterns_;
+};
- // Give up on this pattern. However, if we found a pattern separator (:),
- // advance to the next pattern (skipping over the separator) and restart.
- if (next_sep == nullptr) {
- return false;
+class PositiveAndNegativeUnitTestFilter {
+ public:
+ // Constructs a positive and a negative filter from a string. The string
+ // contains a positive filter optionally followed by a '-' character and a
+ // negative filter. In case only a negative filter is provided the positive
+ // filter will be assumed "*".
+ // A filter is a list of patterns separated by ':'.
+ explicit PositiveAndNegativeUnitTestFilter(const std::string& filter) {
+ std::vector<std::string> positive_and_negative_filters;
+
+ // NOTE: `SplitString` always returns a non-empty container.
+ SplitString(filter, '-', &positive_and_negative_filters);
+ const auto& positive_filter = positive_and_negative_filters.front();
+
+ if (positive_and_negative_filters.size() > 1) {
+ positive_filter_ = UnitTestFilter(
+ positive_filter.empty() ? kUniversalFilter : positive_filter);
+
+ // TODO(b/214626361): Fail on multiple '-' characters
+ // For the moment to preserve old behavior we concatenate the rest of the
+ // string parts with `-` as separator to generate the negative filter.
+ auto negative_filter_string = positive_and_negative_filters[1];
+ for (std::size_t i = 2; i < positive_and_negative_filters.size(); i++)
+ negative_filter_string =
+ negative_filter_string + '-' + positive_and_negative_filters[i];
+ negative_filter_ = UnitTestFilter(negative_filter_string);
+ } else {
+ // In case we don't have a negative filter and positive filter is ""
+ // we do not use kUniversalFilter by design as opposed to when we have a
+ // negative filter.
+ positive_filter_ = UnitTestFilter(positive_filter);
}
- pattern = next_sep + 1;
}
- return true;
+
+ // Returns true if and only if test name (this is generated by appending test
+ // suit name and test name via a '.' character) matches the positive filter
+ // and does not match the negative filter.
+ bool MatchesTest(const std::string& test_suite_name,
+ const std::string& test_name) const {
+ return MatchesName(test_suite_name + "." + test_name);
+ }
+
+ // Returns true if and only if name matches the positive filter and does not
+ // match the negative filter.
+ bool MatchesName(const std::string& name) const {
+ return positive_filter_.MatchesName(name) &&
+ !negative_filter_.MatchesName(name);
+ }
+
+ private:
+ UnitTestFilter positive_filter_;
+ UnitTestFilter negative_filter_;
+};
+} // namespace
+
+bool UnitTestOptions::MatchesFilter(const std::string& name_str,
+ const char* filter) {
+ return UnitTestFilter(filter).MatchesName(name_str);
}
// Returns true if and only if the user-specified filter matches the test
// suite name and the test name.
bool UnitTestOptions::FilterMatchesTest(const std::string& test_suite_name,
const std::string& test_name) {
- const std::string& full_name = test_suite_name + "." + test_name.c_str();
-
// Split --gtest_filter at '-', if there is one, to separate into
// positive filter and negative filter portions
- const char* const p = GTEST_FLAG(filter).c_str();
- const char* const dash = strchr(p, '-');
- std::string positive;
- std::string negative;
- if (dash == nullptr) {
- positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter
- negative = "";
- } else {
- positive = std::string(p, dash); // Everything up to the dash
- negative = std::string(dash + 1); // Everything after the dash
- if (positive.empty()) {
- // Treat '-test1' as the same as '*-test1'
- positive = kUniversalFilter;
- }
- }
-
- // A filter is a colon-separated list of patterns. It matches a
- // test if any pattern in it matches the test.
- return (MatchesFilter(full_name, positive.c_str()) &&
- !MatchesFilter(full_name, negative.c_str()));
+ return PositiveAndNegativeUnitTestFilter(GTEST_FLAG_GET(filter))
+ .MatchesTest(test_suite_name, test_name);
}
#if GTEST_HAS_SEH
@@ -771,7 +853,7 @@ int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
bool should_handle = true;
- if (!GTEST_FLAG(catch_exceptions))
+ if (!GTEST_FLAG_GET(catch_exceptions))
should_handle = false;
else if (exception_code == EXCEPTION_BREAKPOINT)
should_handle = false;
@@ -789,8 +871,7 @@ int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
// results. Intercepts only failures from the current thread.
ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
TestPartResultArray* result)
- : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD),
- result_(result) {
+ : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) {
Init();
}
@@ -799,8 +880,7 @@ ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
// results.
ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
InterceptMode intercept_mode, TestPartResultArray* result)
- : intercept_mode_(intercept_mode),
- result_(result) {
+ : intercept_mode_(intercept_mode), result_(result) {
Init();
}
@@ -844,9 +924,7 @@ namespace internal {
// from user test code. GetTestTypeId() is guaranteed to always
// return the same value, as it always calls GetTypeId<>() from the
// gtest.cc, which is within the Google Test framework.
-TypeId GetTestTypeId() {
- return GetTypeId<Test>();
-}
+TypeId GetTestTypeId() { return GetTypeId<Test>(); }
// The value of GetTestTypeId() as seen from within the Google Test
// library. This is solely for testing GetTestTypeId().
@@ -861,9 +939,9 @@ static AssertionResult HasOneFailure(const char* /* results_expr */,
const TestPartResultArray& results,
TestPartResult::Type type,
const std::string& substr) {
- const std::string expected(type == TestPartResult::kFatalFailure ?
- "1 fatal failure" :
- "1 non-fatal failure");
+ const std::string expected(type == TestPartResult::kFatalFailure
+ ? "1 fatal failure"
+ : "1 non-fatal failure");
Message msg;
if (results.size() != 1) {
msg << "Expected: " << expected << "\n"
@@ -882,10 +960,10 @@ static AssertionResult HasOneFailure(const char* /* results_expr */,
}
if (strstr(r.message(), substr.c_str()) == nullptr) {
- return AssertionFailure() << "Expected: " << expected << " containing \""
- << substr << "\"\n"
- << " Actual:\n"
- << r;
+ return AssertionFailure()
+ << "Expected: " << expected << " containing \"" << substr << "\"\n"
+ << " Actual:\n"
+ << r;
}
return AssertionSuccess();
@@ -908,7 +986,8 @@ SingleFailureChecker::~SingleFailureChecker() {
}
DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
- UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+ UnitTestImpl* unit_test)
+ : unit_test_(unit_test) {}
void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
const TestPartResult& result) {
@@ -917,7 +996,8 @@ void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
}
DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
- UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+ UnitTestImpl* unit_test)
+ : unit_test_(unit_test) {}
void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
const TestPartResult& result) {
@@ -1024,11 +1104,10 @@ int UnitTestImpl::test_to_run_count() const {
// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
return os_stack_trace_getter()->CurrentStackTrace(
- static_cast<int>(GTEST_FLAG(stack_trace_depth)),
- skip_count + 1
+ static_cast<int>(GTEST_FLAG_GET(stack_trace_depth)), skip_count + 1
// Skips the user-specified number of frames plus this function
// itself.
- ); // NOLINT
+ ); // NOLINT
}
// A helper class for measuring elapsed times.
@@ -1072,8 +1151,7 @@ LPCWSTR String::AnsiToUtf16(const char* ansi) {
const int unicode_length =
MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0);
WCHAR* unicode = new WCHAR[unicode_length + 1];
- MultiByteToWideChar(CP_ACP, 0, ansi, length,
- unicode, unicode_length);
+ MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length);
unicode[unicode_length] = 0;
return unicode;
}
@@ -1082,7 +1160,7 @@ LPCWSTR String::AnsiToUtf16(const char* ansi) {
// memory using new. The caller is responsible for deleting the return
// value using delete[]. Returns the ANSI string, or NULL if the
// input is NULL.
-const char* String::Utf16ToAnsi(LPCWSTR utf16_str) {
+const char* String::Utf16ToAnsi(LPCWSTR utf16_str) {
if (!utf16_str) return nullptr;
const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr,
0, nullptr, nullptr);
@@ -1101,7 +1179,7 @@ const char* String::Utf16ToAnsi(LPCWSTR utf16_str) {
// Unlike strcmp(), this function can handle NULL argument(s). A NULL
// C string is considered different to any non-NULL C string,
// including the empty string.
-bool String::CStringEquals(const char * lhs, const char * rhs) {
+bool String::CStringEquals(const char* lhs, const char* rhs) {
if (lhs == nullptr) return rhs == nullptr;
if (rhs == nullptr) return false;
@@ -1115,11 +1193,10 @@ bool String::CStringEquals(const char * lhs, const char * rhs) {
// encoding, and streams the result to the given Message object.
static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
Message* msg) {
- for (size_t i = 0; i != length; ) { // NOLINT
+ for (size_t i = 0; i != length;) { // NOLINT
if (wstr[i] != L'\0') {
*msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
- while (i != length && wstr[i] != L'\0')
- i++;
+ while (i != length && wstr[i] != L'\0') i++;
} else {
*msg << '\0';
i++;
@@ -1161,17 +1238,17 @@ Message::Message() : ss_(new ::std::stringstream) {
// These two overloads allow streaming a wide C string to a Message
// using the UTF-8 encoding.
-Message& Message::operator <<(const wchar_t* wide_c_str) {
+Message& Message::operator<<(const wchar_t* wide_c_str) {
return *this << internal::String::ShowWideCString(wide_c_str);
}
-Message& Message::operator <<(wchar_t* wide_c_str) {
+Message& Message::operator<<(wchar_t* wide_c_str) {
return *this << internal::String::ShowWideCString(wide_c_str);
}
#if GTEST_HAS_STD_WSTRING
// Converts the given wide string to a narrow string using the UTF-8
// encoding, and streams the result to this Message object.
-Message& Message::operator <<(const ::std::wstring& wstr) {
+Message& Message::operator<<(const ::std::wstring& wstr) {
internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
return *this;
}
@@ -1183,44 +1260,6 @@ std::string Message::GetString() const {
return internal::StringStreamToString(ss_.get());
}
-// AssertionResult constructors.
-// Used in EXPECT_TRUE/FALSE(assertion_result).
-AssertionResult::AssertionResult(const AssertionResult& other)
- : success_(other.success_),
- message_(other.message_.get() != nullptr
- ? new ::std::string(*other.message_)
- : static_cast< ::std::string*>(nullptr)) {}
-
-// Swaps two AssertionResults.
-void AssertionResult::swap(AssertionResult& other) {
- using std::swap;
- swap(success_, other.success_);
- swap(message_, other.message_);
-}
-
-// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
-AssertionResult AssertionResult::operator!() const {
- AssertionResult negation(!success_);
- if (message_.get() != nullptr) negation << *message_;
- return negation;
-}
-
-// Makes a successful assertion result.
-AssertionResult AssertionSuccess() {
- return AssertionResult(true);
-}
-
-// Makes a failed assertion result.
-AssertionResult AssertionFailure() {
- return AssertionResult(false);
-}
-
-// Makes a failed assertion result with the given failure message.
-// Deprecated; use AssertionFailure() << message.
-AssertionResult AssertionFailure(const Message& message) {
- return AssertionFailure() << message;
-}
-
namespace internal {
namespace edit_distance {
@@ -1512,8 +1551,7 @@ std::vector<std::string> SplitEscapedString(const std::string& str) {
AssertionResult EqFailure(const char* lhs_expression,
const char* rhs_expression,
const std::string& lhs_value,
- const std::string& rhs_value,
- bool ignoring_case) {
+ const std::string& rhs_value, bool ignoring_case) {
Message msg;
msg << "Expected equality of these values:";
msg << "\n " << lhs_expression;
@@ -1530,10 +1568,8 @@ AssertionResult EqFailure(const char* lhs_expression,
}
if (!lhs_value.empty() && !rhs_value.empty()) {
- const std::vector<std::string> lhs_lines =
- SplitEscapedString(lhs_value);
- const std::vector<std::string> rhs_lines =
- SplitEscapedString(rhs_value);
+ const std::vector<std::string> lhs_lines = SplitEscapedString(lhs_value);
+ const std::vector<std::string> rhs_lines = SplitEscapedString(rhs_value);
if (lhs_lines.size() > 1 || rhs_lines.size() > 1) {
msg << "\nWith diff:\n"
<< edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines);
@@ -1545,27 +1581,21 @@ AssertionResult EqFailure(const char* lhs_expression,
// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
std::string GetBoolAssertionFailureMessage(
- const AssertionResult& assertion_result,
- const char* expression_text,
- const char* actual_predicate_value,
- const char* expected_predicate_value) {
+ const AssertionResult& assertion_result, const char* expression_text,
+ const char* actual_predicate_value, const char* expected_predicate_value) {
const char* actual_message = assertion_result.message();
Message msg;
msg << "Value of: " << expression_text
<< "\n Actual: " << actual_predicate_value;
- if (actual_message[0] != '\0')
- msg << " (" << actual_message << ")";
+ if (actual_message[0] != '\0') msg << " (" << actual_message << ")";
msg << "\nExpected: " << expected_predicate_value;
return msg.GetString();
}
// Helper function for implementing ASSERT_NEAR.
-AssertionResult DoubleNearPredFormat(const char* expr1,
- const char* expr2,
- const char* abs_error_expr,
- double val1,
- double val2,
- double abs_error) {
+AssertionResult DoubleNearPredFormat(const char* expr1, const char* expr2,
+ const char* abs_error_expr, double val1,
+ double val2, double abs_error) {
const double diff = fabs(val1 - val2);
if (diff <= abs_error) return AssertionSuccess();
@@ -1595,20 +1625,17 @@ AssertionResult DoubleNearPredFormat(const char* expr1,
"EXPECT_EQUAL. Consider using EXPECT_DOUBLE_EQ instead.";
}
return AssertionFailure()
- << "The difference between " << expr1 << " and " << expr2
- << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
- << expr1 << " evaluates to " << val1 << ",\n"
- << expr2 << " evaluates to " << val2 << ", and\n"
- << abs_error_expr << " evaluates to " << abs_error << ".";
+ << "The difference between " << expr1 << " and " << expr2 << " is "
+ << diff << ", which exceeds " << abs_error_expr << ", where\n"
+ << expr1 << " evaluates to " << val1 << ",\n"
+ << expr2 << " evaluates to " << val2 << ", and\n"
+ << abs_error_expr << " evaluates to " << abs_error << ".";
}
-
// Helper template for implementing FloatLE() and DoubleLE().
template <typename RawType>
-AssertionResult FloatingPointLE(const char* expr1,
- const char* expr2,
- RawType val1,
- RawType val2) {
+AssertionResult FloatingPointLE(const char* expr1, const char* expr2,
+ RawType val1, RawType val2) {
// Returns success if val1 is less than val2,
if (val1 < val2) {
return AssertionSuccess();
@@ -1633,24 +1660,24 @@ AssertionResult FloatingPointLE(const char* expr1,
<< val2;
return AssertionFailure()
- << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
- << " Actual: " << StringStreamToString(&val1_ss) << " vs "
- << StringStreamToString(&val2_ss);
+ << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+ << " Actual: " << StringStreamToString(&val1_ss) << " vs "
+ << StringStreamToString(&val2_ss);
}
} // namespace internal
// Asserts that val1 is less than, or almost equal to, val2. Fails
// otherwise. In particular, it fails if either val1 or val2 is NaN.
-AssertionResult FloatLE(const char* expr1, const char* expr2,
- float val1, float val2) {
+AssertionResult FloatLE(const char* expr1, const char* expr2, float val1,
+ float val2) {
return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
}
// Asserts that val1 is less than, or almost equal to, val2. Fails
// otherwise. In particular, it fails if either val1 or val2 is NaN.
-AssertionResult DoubleLE(const char* expr1, const char* expr2,
- double val1, double val2) {
+AssertionResult DoubleLE(const char* expr1, const char* expr2, double val1,
+ double val2) {
return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
}
@@ -1658,62 +1685,51 @@ namespace internal {
// The helper function for {ASSERT|EXPECT}_STREQ.
AssertionResult CmpHelperSTREQ(const char* lhs_expression,
- const char* rhs_expression,
- const char* lhs,
+ const char* rhs_expression, const char* lhs,
const char* rhs) {
if (String::CStringEquals(lhs, rhs)) {
return AssertionSuccess();
}
- return EqFailure(lhs_expression,
- rhs_expression,
- PrintToString(lhs),
- PrintToString(rhs),
- false);
+ return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+ PrintToString(rhs), false);
}
// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression,
- const char* rhs_expression,
- const char* lhs,
+ const char* rhs_expression, const char* lhs,
const char* rhs) {
if (String::CaseInsensitiveCStringEquals(lhs, rhs)) {
return AssertionSuccess();
}
- return EqFailure(lhs_expression,
- rhs_expression,
- PrintToString(lhs),
- PrintToString(rhs),
- true);
+ return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+ PrintToString(rhs), true);
}
// The helper function for {ASSERT|EXPECT}_STRNE.
AssertionResult CmpHelperSTRNE(const char* s1_expression,
- const char* s2_expression,
- const char* s1,
+ const char* s2_expression, const char* s1,
const char* s2) {
if (!String::CStringEquals(s1, s2)) {
return AssertionSuccess();
} else {
- return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
- << s2_expression << "), actual: \""
- << s1 << "\" vs \"" << s2 << "\"";
+ return AssertionFailure()
+ << "Expected: (" << s1_expression << ") != (" << s2_expression
+ << "), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
}
}
// The helper function for {ASSERT|EXPECT}_STRCASENE.
AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
- const char* s2_expression,
- const char* s1,
+ const char* s2_expression, const char* s1,
const char* s2) {
if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
return AssertionSuccess();
} else {
return AssertionFailure()
- << "Expected: (" << s1_expression << ") != ("
- << s2_expression << ") (ignoring case), actual: \""
- << s1 << "\" vs \"" << s2 << "\"";
+ << "Expected: (" << s1_expression << ") != (" << s2_expression
+ << ") (ignoring case), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
}
}
@@ -1741,8 +1757,7 @@ bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
// StringType here can be either ::std::string or ::std::wstring.
template <typename StringType>
-bool IsSubstringPred(const StringType& needle,
- const StringType& haystack) {
+bool IsSubstringPred(const StringType& needle, const StringType& haystack) {
return haystack.find(needle) != StringType::npos;
}
@@ -1751,21 +1766,22 @@ bool IsSubstringPred(const StringType& needle,
// StringType here can be const char*, const wchar_t*, ::std::string,
// or ::std::wstring.
template <typename StringType>
-AssertionResult IsSubstringImpl(
- bool expected_to_be_substring,
- const char* needle_expr, const char* haystack_expr,
- const StringType& needle, const StringType& haystack) {
+AssertionResult IsSubstringImpl(bool expected_to_be_substring,
+ const char* needle_expr,
+ const char* haystack_expr,
+ const StringType& needle,
+ const StringType& haystack) {
if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
return AssertionSuccess();
const bool is_wide_string = sizeof(needle[0]) > 1;
const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
return AssertionFailure()
- << "Value of: " << needle_expr << "\n"
- << " Actual: " << begin_string_quote << needle << "\"\n"
- << "Expected: " << (expected_to_be_substring ? "" : "not ")
- << "a substring of " << haystack_expr << "\n"
- << "Which is: " << begin_string_quote << haystack << "\"";
+ << "Value of: " << needle_expr << "\n"
+ << " Actual: " << begin_string_quote << needle << "\"\n"
+ << "Expected: " << (expected_to_be_substring ? "" : "not ")
+ << "a substring of " << haystack_expr << "\n"
+ << "Which is: " << begin_string_quote << haystack << "\"";
}
} // namespace
@@ -1774,52 +1790,52 @@ AssertionResult IsSubstringImpl(
// substring of haystack (NULL is considered a substring of itself
// only), and return an appropriate error message when they fail.
-AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const char* needle, const char* haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+ const char* needle, const char* haystack) {
return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const wchar_t* needle, const wchar_t* haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+ const wchar_t* needle, const wchar_t* haystack) {
return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const char* needle, const char* haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr, const char* needle,
+ const char* haystack) {
return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const wchar_t* needle, const wchar_t* haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr, const wchar_t* needle,
+ const wchar_t* haystack) {
return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::string& needle, const ::std::string& haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+ const ::std::string& needle,
+ const ::std::string& haystack) {
return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::string& needle, const ::std::string& haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::string& needle,
+ const ::std::string& haystack) {
return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
}
#if GTEST_HAS_STD_WSTRING
-AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::wstring& needle, const ::std::wstring& haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+ const ::std::wstring& needle,
+ const ::std::wstring& haystack) {
return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::wstring& needle, const ::std::wstring& haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::wstring& needle,
+ const ::std::wstring& haystack) {
return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
}
#endif // GTEST_HAS_STD_WSTRING
@@ -1831,43 +1847,42 @@ namespace internal {
namespace {
// Helper function for IsHRESULT{SuccessFailure} predicates
-AssertionResult HRESULTFailureHelper(const char* expr,
- const char* expected,
+AssertionResult HRESULTFailureHelper(const char* expr, const char* expected,
long hr) { // NOLINT
-# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
// Windows CE doesn't support FormatMessage.
const char error_text[] = "";
-# else
+#else
// Looks up the human-readable system message for the HRESULT code
// and since we're not passing any params to FormatMessage, we don't
// want inserts expanded.
- const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
- FORMAT_MESSAGE_IGNORE_INSERTS;
+ const DWORD kFlags =
+ FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS;
const DWORD kBufSize = 4096;
// Gets the system's human readable message string for this HRESULT.
- char error_text[kBufSize] = { '\0' };
+ char error_text[kBufSize] = {'\0'};
DWORD message_length = ::FormatMessageA(kFlags,
- 0, // no source, we're asking system
+ 0, // no source, we're asking system
static_cast<DWORD>(hr), // the error
- 0, // no line width restrictions
+ 0, // no line width restrictions
error_text, // output buffer
kBufSize, // buf size
nullptr); // no arguments for inserts
// Trims tailing white space (FormatMessage leaves a trailing CR-LF)
for (; message_length && IsSpace(error_text[message_length - 1]);
- --message_length) {
+ --message_length) {
error_text[message_length - 1] = '\0';
}
-# endif // GTEST_OS_WINDOWS_MOBILE
+#endif // GTEST_OS_WINDOWS_MOBILE
const std::string error_hex("0x" + String::FormatHexInt(hr));
return ::testing::AssertionFailure()
- << "Expected: " << expr << " " << expected << ".\n"
- << " Actual: " << error_hex << " " << error_text << "\n";
+ << "Expected: " << expr << " " << expected << ".\n"
+ << " Actual: " << error_hex << " " << error_text << "\n";
}
} // namespace
@@ -1901,16 +1916,18 @@ AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT
// 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// The maximum code-point a one-byte UTF-8 sequence can represent.
-constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1;
+constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1;
// The maximum code-point a two-byte UTF-8 sequence can represent.
constexpr uint32_t kMaxCodePoint2 = (static_cast<uint32_t>(1) << (5 + 6)) - 1;
// The maximum code-point a three-byte UTF-8 sequence can represent.
-constexpr uint32_t kMaxCodePoint3 = (static_cast<uint32_t>(1) << (4 + 2*6)) - 1;
+constexpr uint32_t kMaxCodePoint3 =
+ (static_cast<uint32_t>(1) << (4 + 2 * 6)) - 1;
// The maximum code-point a four-byte UTF-8 sequence can represent.
-constexpr uint32_t kMaxCodePoint4 = (static_cast<uint32_t>(1) << (3 + 3*6)) - 1;
+constexpr uint32_t kMaxCodePoint4 =
+ (static_cast<uint32_t>(1) << (3 + 3 * 6)) - 1;
// Chops off the n lowest bits from a bit pattern. Returns the n
// lowest bits. As a side effect, the original bit pattern will be
@@ -1935,7 +1952,7 @@ std::string CodePointToUtf8(uint32_t code_point) {
char str[5]; // Big enough for the largest valid code point.
if (code_point <= kMaxCodePoint1) {
str[1] = '\0';
- str[0] = static_cast<char>(code_point); // 0xxxxxxx
+ str[0] = static_cast<char>(code_point); // 0xxxxxxx
} else if (code_point <= kMaxCodePoint2) {
str[2] = '\0';
str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
@@ -1963,8 +1980,8 @@ std::string CodePointToUtf8(uint32_t code_point) {
// and thus should be combined into a single Unicode code point
// using CreateCodePointFromUtf16SurrogatePair.
inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
- return sizeof(wchar_t) == 2 &&
- (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
+ return sizeof(wchar_t) == 2 && (first & 0xFC00) == 0xD800 &&
+ (second & 0xFC00) == 0xDC00;
}
// Creates a Unicode code point from UTF16 surrogate pair.
@@ -1995,8 +2012,7 @@ inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first,
// and contains invalid UTF-16 surrogate pairs, values in those pairs
// will be encoded as individual Unicode characters from Basic Normal Plane.
std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
- if (num_chars == -1)
- num_chars = static_cast<int>(wcslen(str));
+ if (num_chars == -1) num_chars = static_cast<int>(wcslen(str));
::std::stringstream stream;
for (int i = 0; i < num_chars; ++i) {
@@ -2005,8 +2021,8 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
if (str[i] == L'\0') {
break;
} else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
- unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
- str[i + 1]);
+ unicode_code_point =
+ CreateCodePointFromUtf16SurrogatePair(str[i], str[i + 1]);
i++;
} else {
unicode_code_point = static_cast<uint32_t>(str[i]);
@@ -2019,7 +2035,7 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
// Converts a wide C string to an std::string using the UTF-8 encoding.
// NULL will be converted to "(null)".
-std::string String::ShowWideCString(const wchar_t * wide_c_str) {
+std::string String::ShowWideCString(const wchar_t* wide_c_str) {
if (wide_c_str == nullptr) return "(null)";
return internal::WideStringToUtf8(wide_c_str, -1);
@@ -2031,7 +2047,7 @@ std::string String::ShowWideCString(const wchar_t * wide_c_str) {
// Unlike wcscmp(), this function can handle NULL argument(s). A NULL
// C string is considered different to any non-NULL C string,
// including the empty string.
-bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
+bool String::WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs) {
if (lhs == nullptr) return rhs == nullptr;
if (rhs == nullptr) return false;
@@ -2041,33 +2057,27 @@ bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
// Helper function for *_STREQ on wide strings.
AssertionResult CmpHelperSTREQ(const char* lhs_expression,
- const char* rhs_expression,
- const wchar_t* lhs,
+ const char* rhs_expression, const wchar_t* lhs,
const wchar_t* rhs) {
if (String::WideCStringEquals(lhs, rhs)) {
return AssertionSuccess();
}
- return EqFailure(lhs_expression,
- rhs_expression,
- PrintToString(lhs),
- PrintToString(rhs),
- false);
+ return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+ PrintToString(rhs), false);
}
// Helper function for *_STRNE on wide strings.
AssertionResult CmpHelperSTRNE(const char* s1_expression,
- const char* s2_expression,
- const wchar_t* s1,
+ const char* s2_expression, const wchar_t* s1,
const wchar_t* s2) {
if (!String::WideCStringEquals(s1, s2)) {
return AssertionSuccess();
}
- return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
- << s2_expression << "), actual: "
- << PrintToString(s1)
- << " vs " << PrintToString(s2);
+ return AssertionFailure()
+ << "Expected: (" << s1_expression << ") != (" << s2_expression
+ << "), actual: " << PrintToString(s1) << " vs " << PrintToString(s2);
}
// Compares two C strings, ignoring case. Returns true if and only if they have
@@ -2076,7 +2086,7 @@ AssertionResult CmpHelperSTRNE(const char* s1_expression,
// Unlike strcasecmp(), this function can handle NULL argument(s). A
// NULL C string is considered different to any non-NULL C string,
// including the empty string.
-bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
+bool String::CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) {
if (lhs == nullptr) return rhs == nullptr;
if (rhs == nullptr) return false;
return posix::StrCaseCmp(lhs, rhs) == 0;
@@ -2118,8 +2128,8 @@ bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
// Returns true if and only if str ends with the given suffix, ignoring case.
// Any string is considered to end with an empty suffix.
-bool String::EndsWithCaseInsensitive(
- const std::string& str, const std::string& suffix) {
+bool String::EndsWithCaseInsensitive(const std::string& str,
+ const std::string& suffix) {
const size_t str_len = str.length();
const size_t suffix_len = suffix.length();
return (str_len >= suffix_len) &&
@@ -2202,15 +2212,13 @@ TestResult::TestResult()
: death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {}
// D'tor.
-TestResult::~TestResult() {
-}
+TestResult::~TestResult() {}
// Returns the i-th test part result among all the results. i can
// range from 0 to total_part_count() - 1. If i is not in that range,
// aborts the program.
const TestPartResult& TestResult::GetTestPartResult(int i) const {
- if (i < 0 || i >= total_part_count())
- internal::posix::Abort();
+ if (i < 0 || i >= total_part_count()) internal::posix::Abort();
return test_part_results_.at(static_cast<size_t>(i));
}
@@ -2218,15 +2226,12 @@ const TestPartResult& TestResult::GetTestPartResult(int i) const {
// test_property_count() - 1. If i is not in that range, aborts the
// program.
const TestProperty& TestResult::GetTestProperty(int i) const {
- if (i < 0 || i >= test_property_count())
- internal::posix::Abort();
+ if (i < 0 || i >= test_property_count()) internal::posix::Abort();
return test_properties_.at(static_cast<size_t>(i));
}
// Clears the test part results.
-void TestResult::ClearTestPartResults() {
- test_part_results_.clear();
-}
+void TestResult::ClearTestPartResults() { test_part_results_.clear(); }
// Adds a test part result to the list.
void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
@@ -2255,15 +2260,8 @@ void TestResult::RecordProperty(const std::string& xml_element,
// The list of reserved attributes used in the <testsuites> element of XML
// output.
static const char* const kReservedTestSuitesAttributes[] = {
- "disabled",
- "errors",
- "failures",
- "name",
- "random_seed",
- "tests",
- "time",
- "timestamp"
-};
+ "disabled", "errors", "failures", "name",
+ "random_seed", "tests", "time", "timestamp"};
// The list of reserved attributes used in the <testsuite> element of XML
// output.
@@ -2273,8 +2271,8 @@ static const char* const kReservedTestSuiteAttributes[] = {
// The list of reserved attributes used in the <testcase> element of XML output.
static const char* const kReservedTestCaseAttributes[] = {
- "classname", "name", "status", "time", "type_param",
- "value_param", "file", "line"};
+ "classname", "name", "status", "time",
+ "type_param", "value_param", "file", "line"};
// Use a slightly different set for allowed output to ensure existing tests can
// still RecordProperty("result") or "RecordProperty(timestamp")
@@ -2336,7 +2334,7 @@ static bool ValidateTestPropertyName(
const std::string& property_name,
const std::vector<std::string>& reserved_names) {
if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
- reserved_names.end()) {
+ reserved_names.end()) {
ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
<< " (" << FormatWordList(reserved_names)
<< " are reserved by " << GTEST_NAME_ << ")";
@@ -2374,8 +2372,7 @@ bool TestResult::Skipped() const {
// Returns true if and only if the test failed.
bool TestResult::Failed() const {
for (int i = 0; i < total_part_count(); ++i) {
- if (GetTestPartResult(i).failed())
- return true;
+ if (GetTestPartResult(i).failed()) return true;
}
return false;
}
@@ -2416,27 +2413,22 @@ int TestResult::test_property_count() const {
// Creates a Test object.
// The c'tor saves the states of all flags.
-Test::Test()
- : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {
-}
+Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {}
// The d'tor restores the states of all flags. The actual work is
// done by the d'tor of the gtest_flag_saver_ field, and thus not
// visible here.
-Test::~Test() {
-}
+Test::~Test() {}
// Sets up the test fixture.
//
// A sub-class may override this.
-void Test::SetUp() {
-}
+void Test::SetUp() {}
// Tears down the test fixture.
//
// A sub-class may override this.
-void Test::TearDown() {
-}
+void Test::TearDown() {}
// Allows user supplied key value pairs to be recorded for later output.
void Test::RecordProperty(const std::string& key, const std::string& value) {
@@ -2541,8 +2533,8 @@ bool Test::HasSameFixtureClass() {
static std::string* FormatSehExceptionMessage(DWORD exception_code,
const char* location) {
Message message;
- message << "SEH exception with code 0x" << std::setbase(16) <<
- exception_code << std::setbase(10) << " thrown in " << location << ".";
+ message << "SEH exception with code 0x" << std::setbase(16) << exception_code
+ << std::setbase(10) << " thrown in " << location << ".";
return new std::string(message.GetString());
}
@@ -2585,8 +2577,8 @@ GoogleTestFailureException::GoogleTestFailureException(
// exceptions in the same function. Therefore, we provide a separate
// wrapper function for handling SEH exceptions.)
template <class T, typename Result>
-Result HandleSehExceptionsInMethodIfSupported(
- T* object, Result (T::*method)(), const char* location) {
+Result HandleSehExceptionsInMethodIfSupported(T* object, Result (T::*method)(),
+ const char* location) {
#if GTEST_HAS_SEH
__try {
return (object->*method)();
@@ -2595,8 +2587,8 @@ Result HandleSehExceptionsInMethodIfSupported(
// We create the exception message on the heap because VC++ prohibits
// creation of objects with destructors on stack in functions using __try
// (see error C2712).
- std::string* exception_message = FormatSehExceptionMessage(
- GetExceptionCode(), location);
+ std::string* exception_message =
+ FormatSehExceptionMessage(GetExceptionCode(), location);
internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
*exception_message);
delete exception_message;
@@ -2612,8 +2604,8 @@ Result HandleSehExceptionsInMethodIfSupported(
// exceptions, if they are supported; returns the 0-value for type
// Result in case of an SEH exception.
template <class T, typename Result>
-Result HandleExceptionsInMethodIfSupported(
- T* object, Result (T::*method)(), const char* location) {
+Result HandleExceptionsInMethodIfSupported(T* object, Result (T::*method)(),
+ const char* location) {
// NOTE: The user code can affect the way in which Google Test handles
// exceptions by setting GTEST_FLAG(catch_exceptions), but only before
// RUN_ALL_TESTS() starts. It is technically possible to check the flag
@@ -2623,7 +2615,7 @@ Result HandleExceptionsInMethodIfSupported(
// try {
// // Perform the test method.
// } catch (...) {
- // if (GTEST_FLAG(catch_exceptions))
+ // if (GTEST_FLAG_GET(catch_exceptions))
// // Report the exception as failure.
// else
// throw; // Re-throws the original exception.
@@ -2679,16 +2671,16 @@ void Test::Run() {
// GTEST_SKIP().
if (!HasFatalFailure() && !IsSkipped()) {
impl->os_stack_trace_getter()->UponLeavingGTest();
- internal::HandleExceptionsInMethodIfSupported(
- this, &Test::TestBody, "the test body");
+ internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody,
+ "the test body");
}
// However, we want to clean up as much as possible. Hence we will
// always call TearDown(), even if SetUp() or the test body has
// failed.
impl->os_stack_trace_getter()->UponLeavingGTest();
- internal::HandleExceptionsInMethodIfSupported(
- this, &Test::TearDown, "TearDown()");
+ internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown,
+ "TearDown()");
}
// Returns true if and only if the current test has a fatal failure.
@@ -2698,8 +2690,9 @@ bool Test::HasFatalFailure() {
// Returns true if and only if the current test has a non-fatal failure.
bool Test::HasNonfatalFailure() {
- return internal::GetUnitTestImpl()->current_test_result()->
- HasNonfatalFailure();
+ return internal::GetUnitTestImpl()
+ ->current_test_result()
+ ->HasNonfatalFailure();
}
// Returns true if and only if the current test was skipped.
@@ -2799,11 +2792,10 @@ class TestNameIs {
// Constructor.
//
// TestNameIs has NO default constructor.
- explicit TestNameIs(const char* name)
- : name_(name) {}
+ explicit TestNameIs(const char* name) : name_(name) {}
// Returns true if and only if the test name of test_info matches name_.
- bool operator()(const TestInfo * test_info) const {
+ bool operator()(const TestInfo* test_info) const {
return test_info && test_info->name() == name_;
}
@@ -2831,20 +2823,20 @@ void UnitTestImpl::RegisterParameterizedTests() {
// Creates the test object, runs it, records its result, and then
// deletes it.
void TestInfo::Run() {
- if (!should_run_) return;
+ TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+ if (!should_run_) {
+ if (is_disabled_ && matches_filter_) repeater->OnTestDisabled(*this);
+ return;
+ }
// Tells UnitTest where to store test result.
internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
impl->set_current_test_info(this);
- TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
-
// Notifies the unit test event listeners that a test is about to start.
repeater->OnTestStart(*this);
-
result_.set_start_timestamp(internal::GetTimeInMillis());
internal::Timer timer;
-
impl->os_stack_trace_getter()->UponLeavingGTest();
// Creates the test object.
@@ -3009,11 +3001,18 @@ void TestSuite::Run() {
internal::HandleExceptionsInMethodIfSupported(
this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()");
+ const bool skip_all = ad_hoc_test_result().Failed();
+
start_timestamp_ = internal::GetTimeInMillis();
internal::Timer timer;
for (int i = 0; i < total_test_count(); i++) {
- GetMutableTestInfo(i)->Run();
- if (GTEST_FLAG(fail_fast) && GetMutableTestInfo(i)->result()->Failed()) {
+ if (skip_all) {
+ GetMutableTestInfo(i)->Skip();
+ } else {
+ GetMutableTestInfo(i)->Run();
+ }
+ if (GTEST_FLAG_GET(fail_fast) &&
+ GetMutableTestInfo(i)->result()->Failed()) {
for (int j = i + 1; j < total_test_count(); j++) {
GetMutableTestInfo(j)->Skip();
}
@@ -3089,11 +3088,10 @@ void TestSuite::UnshuffleTests() {
//
// FormatCountableNoun(1, "formula", "formuli") returns "1 formula".
// FormatCountableNoun(5, "book", "books") returns "5 books".
-static std::string FormatCountableNoun(int count,
- const char * singular_form,
- const char * plural_form) {
+static std::string FormatCountableNoun(int count, const char* singular_form,
+ const char* plural_form) {
return internal::StreamableToString(count) + " " +
- (count == 1 ? singular_form : plural_form);
+ (count == 1 ? singular_form : plural_form);
}
// Formats the count of tests.
@@ -3110,7 +3108,7 @@ static std::string FormatTestSuiteCount(int test_suite_count) {
// representation. Both kNonFatalFailure and kFatalFailure are translated
// to "Failure", as the user usually doesn't care about the difference
// between the two when viewing the test result.
-static const char * TestPartResultTypeToString(TestPartResult::Type type) {
+static const char* TestPartResultTypeToString(TestPartResult::Type type) {
switch (type) {
case TestPartResult::kSkip:
return "Skipped\n";
@@ -3137,17 +3135,18 @@ enum class GTestColor { kDefault, kRed, kGreen, kYellow };
// Prints a TestPartResult to an std::string.
static std::string PrintTestPartResultToString(
const TestPartResult& test_part_result) {
- return (Message()
- << internal::FormatFileLocation(test_part_result.file_name(),
- test_part_result.line_number())
- << " " << TestPartResultTypeToString(test_part_result.type())
- << test_part_result.message()).GetString();
+ return (Message() << internal::FormatFileLocation(
+ test_part_result.file_name(),
+ test_part_result.line_number())
+ << " "
+ << TestPartResultTypeToString(test_part_result.type())
+ << test_part_result.message())
+ .GetString();
}
// Prints a TestPartResult.
static void PrintTestPartResult(const TestPartResult& test_part_result) {
- const std::string& result =
- PrintTestPartResultToString(test_part_result);
+ const std::string& result = PrintTestPartResultToString(test_part_result);
printf("%s\n", result.c_str());
fflush(stdout);
// If the test program runs in Visual Studio or a debugger, the
@@ -3164,8 +3163,8 @@ static void PrintTestPartResult(const TestPartResult& test_part_result) {
}
// class PrettyUnitTestResultPrinter
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
- !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+ !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
// Returns the character attribute for the given color.
static WORD GetColorAttribute(GTestColor color) {
@@ -3176,7 +3175,8 @@ static WORD GetColorAttribute(GTestColor color) {
return FOREGROUND_GREEN;
case GTestColor::kYellow:
return FOREGROUND_RED | FOREGROUND_GREEN;
- default: return 0;
+ default:
+ return 0;
}
}
@@ -3232,7 +3232,8 @@ static const char* GetAnsiColorCode(GTestColor color) {
// Returns true if and only if Google Test should use colors in the output.
bool ShouldUseColor(bool stdout_is_tty) {
- const char* const gtest_color = GTEST_FLAG(color).c_str();
+ std::string c = GTEST_FLAG_GET(color);
+ const char* const gtest_color = c.c_str();
if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
@@ -3259,9 +3260,9 @@ bool ShouldUseColor(bool stdout_is_tty) {
}
return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
- String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
- String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
- String::CStringEquals(gtest_color, "1");
+ String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+ String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+ String::CStringEquals(gtest_color, "1");
// We take "yes", "true", "t", and "1" as meaning "yes". If the
// value is neither one of these nor "auto", we treat it as "no" to
// be conservative.
@@ -3273,18 +3274,13 @@ bool ShouldUseColor(bool stdout_is_tty) {
// that would be colored when printed, as can be done on Linux.
GTEST_ATTRIBUTE_PRINTF_(2, 3)
-static void ColoredPrintf(GTestColor color, const char *fmt, ...) {
+static void ColoredPrintf(GTestColor color, const char* fmt, ...) {
va_list args;
va_start(args, fmt);
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \
- GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM)
- const bool use_color = AlwaysFalse();
-#else
static const bool in_color_mode =
ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
const bool use_color = in_color_mode && (color != GTestColor::kDefault);
-#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS
if (!use_color) {
vprintf(fmt, args);
@@ -3292,8 +3288,8 @@ static void ColoredPrintf(GTestColor color, const char *fmt, ...) {
return;
}
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
- !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+ !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
// Gets the current text color.
@@ -3364,6 +3360,7 @@ class PrettyUnitTestResultPrinter : public TestEventListener {
#endif // OnTestCaseStart
void OnTestStart(const TestInfo& test_info) override;
+ void OnTestDisabled(const TestInfo& test_info) override;
void OnTestPartResult(const TestPartResult& result) override;
void OnTestEnd(const TestInfo& test_info) override;
@@ -3384,13 +3381,14 @@ class PrettyUnitTestResultPrinter : public TestEventListener {
static void PrintSkippedTests(const UnitTest& unit_test);
};
- // Fired before each iteration of tests starts.
+// Fired before each iteration of tests starts.
void PrettyUnitTestResultPrinter::OnTestIterationStart(
const UnitTest& unit_test, int iteration) {
- if (GTEST_FLAG(repeat) != 1)
+ if (GTEST_FLAG_GET(repeat) != 1)
printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
- const char* const filter = GTEST_FLAG(filter).c_str();
+ std::string f = GTEST_FLAG_GET(filter);
+ const char* const filter = f.c_str();
// Prints the filter if it's not *. This reminds the user that some
// tests may be skipped.
@@ -3406,7 +3404,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationStart(
internal::posix::GetEnv(kTestTotalShards));
}
- if (GTEST_FLAG(shuffle)) {
+ if (GTEST_FLAG_GET(shuffle)) {
ColoredPrintf(GTestColor::kYellow,
"Note: Randomizing tests' orders with a seed of %d .\n",
unit_test.random_seed());
@@ -3462,6 +3460,13 @@ void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
fflush(stdout);
}
+void PrettyUnitTestResultPrinter::OnTestDisabled(const TestInfo& test_info) {
+ ColoredPrintf(GTestColor::kYellow, "[ DISABLED ] ");
+ PrintTestName(test_info.test_suite_name(), test_info.name());
+ printf("\n");
+ fflush(stdout);
+}
+
// Called after an assertion failure.
void PrettyUnitTestResultPrinter::OnTestPartResult(
const TestPartResult& result) {
@@ -3486,12 +3491,12 @@ void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
ColoredPrintf(GTestColor::kRed, "[ FAILED ] ");
}
PrintTestName(test_info.test_suite_name(), test_info.name());
- if (test_info.result()->Failed())
- PrintFullTestCommentIfPresent(test_info);
+ if (test_info.result()->Failed()) PrintFullTestCommentIfPresent(test_info);
- if (GTEST_FLAG(print_time)) {
- printf(" (%s ms)\n", internal::StreamableToString(
- test_info.result()->elapsed_time()).c_str());
+ if (GTEST_FLAG_GET(print_time)) {
+ printf(" (%s ms)\n",
+ internal::StreamableToString(test_info.result()->elapsed_time())
+ .c_str());
} else {
printf("\n");
}
@@ -3500,7 +3505,7 @@ void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
- if (!GTEST_FLAG(print_time)) return;
+ if (!GTEST_FLAG_GET(print_time)) return;
const std::string counts =
FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
@@ -3511,7 +3516,7 @@ void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
}
#else
void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) {
- if (!GTEST_FLAG(print_time)) return;
+ if (!GTEST_FLAG_GET(print_time)) return;
const std::string counts =
FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
@@ -3607,7 +3612,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
printf("%s from %s ran.",
FormatTestCount(unit_test.test_to_run_count()).c_str(),
FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
- if (GTEST_FLAG(print_time)) {
+ if (GTEST_FLAG_GET(print_time)) {
printf(" (%s ms total)",
internal::StreamableToString(unit_test.elapsed_time()).c_str());
}
@@ -3628,7 +3633,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
}
int num_disabled = unit_test.reportable_disabled_test_count();
- if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+ if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) {
if (unit_test.Passed()) {
printf("\n"); // Add a spacer if no FAILURE banner is displayed.
}
@@ -3664,6 +3669,7 @@ class BriefUnitTestResultPrinter : public TestEventListener {
#endif // OnTestCaseStart
void OnTestStart(const TestInfo& /*test_info*/) override {}
+ void OnTestDisabled(const TestInfo& /*test_info*/) override {}
void OnTestPartResult(const TestPartResult& result) override;
void OnTestEnd(const TestInfo& test_info) override;
@@ -3700,7 +3706,7 @@ void BriefUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
PrintTestName(test_info.test_suite_name(), test_info.name());
PrintFullTestCommentIfPresent(test_info);
- if (GTEST_FLAG(print_time)) {
+ if (GTEST_FLAG_GET(print_time)) {
printf(" (%s ms)\n",
internal::StreamableToString(test_info.result()->elapsed_time())
.c_str());
@@ -3717,7 +3723,7 @@ void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
printf("%s from %s ran.",
FormatTestCount(unit_test.test_to_run_count()).c_str(),
FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
- if (GTEST_FLAG(print_time)) {
+ if (GTEST_FLAG_GET(print_time)) {
printf(" (%s ms total)",
internal::StreamableToString(unit_test.elapsed_time()).c_str());
}
@@ -3732,7 +3738,7 @@ void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
}
int num_disabled = unit_test.reportable_disabled_test_count();
- if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+ if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) {
if (unit_test.Passed()) {
printf("\n"); // Add a spacer if no FAILURE banner is displayed.
}
@@ -3752,7 +3758,7 @@ class TestEventRepeater : public TestEventListener {
public:
TestEventRepeater() : forwarding_enabled_(true) {}
~TestEventRepeater() override;
- void Append(TestEventListener *listener);
+ void Append(TestEventListener* listener);
TestEventListener* Release(TestEventListener* listener);
// Controls whether events will be forwarded to listeners_. Set to false
@@ -3770,6 +3776,7 @@ class TestEventRepeater : public TestEventListener {
#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
void OnTestSuiteStart(const TestSuite& parameter) override;
void OnTestStart(const TestInfo& test_info) override;
+ void OnTestDisabled(const TestInfo& test_info) override;
void OnTestPartResult(const TestPartResult& result) override;
void OnTestEnd(const TestInfo& test_info) override;
// Legacy API is deprecated but still available
@@ -3789,18 +3796,19 @@ class TestEventRepeater : public TestEventListener {
// The list of listeners that receive events.
std::vector<TestEventListener*> listeners_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+ TestEventRepeater(const TestEventRepeater&) = delete;
+ TestEventRepeater& operator=(const TestEventRepeater&) = delete;
};
TestEventRepeater::~TestEventRepeater() {
ForEach(listeners_, Delete<TestEventListener>);
}
-void TestEventRepeater::Append(TestEventListener *listener) {
+void TestEventRepeater::Append(TestEventListener* listener) {
listeners_.push_back(listener);
}
-TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
+TestEventListener* TestEventRepeater::Release(TestEventListener* listener) {
for (size_t i = 0; i < listeners_.size(); ++i) {
if (listeners_[i] == listener) {
listeners_.erase(listeners_.begin() + static_cast<int>(i));
@@ -3813,14 +3821,14 @@ TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
// Since most methods are very similar, use macros to reduce boilerplate.
// This defines a member that forwards the call to all listeners.
-#define GTEST_REPEATER_METHOD_(Name, Type) \
-void TestEventRepeater::Name(const Type& parameter) { \
- if (forwarding_enabled_) { \
- for (size_t i = 0; i < listeners_.size(); i++) { \
- listeners_[i]->Name(parameter); \
- } \
- } \
-}
+#define GTEST_REPEATER_METHOD_(Name, Type) \
+ void TestEventRepeater::Name(const Type& parameter) { \
+ if (forwarding_enabled_) { \
+ for (size_t i = 0; i < listeners_.size(); i++) { \
+ listeners_[i]->Name(parameter); \
+ } \
+ } \
+ }
// This defines a member that forwards the call to all listeners in reverse
// order.
#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
@@ -3840,6 +3848,7 @@ GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite)
#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite)
GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestDisabled, TestInfo)
GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
@@ -3890,12 +3899,13 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener {
private:
// Is c a whitespace character that is normalized to a space character
// when it appears in an XML attribute value?
- static bool IsNormalizableWhitespace(char c) {
- return c == 0x9 || c == 0xA || c == 0xD;
+ static bool IsNormalizableWhitespace(unsigned char c) {
+ return c == '\t' || c == '\n' || c == '\r';
}
// May c appear in a well-formed XML document?
- static bool IsValidXmlCharacter(char c) {
+ // https://www.w3.org/TR/REC-xml/#charsets
+ static bool IsValidXmlCharacter(unsigned char c) {
return IsNormalizableWhitespace(c) || c >= 0x20;
}
@@ -3965,7 +3975,8 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener {
// The output file.
const std::string output_file_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
+ XmlUnitTestResultPrinter(const XmlUnitTestResultPrinter&) = delete;
+ XmlUnitTestResultPrinter& operator=(const XmlUnitTestResultPrinter&) = delete;
};
// Creates a new XmlUnitTestResultPrinter.
@@ -4005,8 +4016,8 @@ void XmlUnitTestResultPrinter::ListTestsMatchingFilter(
// module will consist of ordinary English text.
// If this module is ever modified to produce version 1.1 XML output,
// most invalid characters can be retained using character references.
-std::string XmlUnitTestResultPrinter::EscapeXml(
- const std::string& str, bool is_attribute) {
+std::string XmlUnitTestResultPrinter::EscapeXml(const std::string& str,
+ bool is_attribute) {
Message m;
for (size_t i = 0; i < str.size(); ++i) {
@@ -4034,8 +4045,9 @@ std::string XmlUnitTestResultPrinter::EscapeXml(
m << '"';
break;
default:
- if (IsValidXmlCharacter(ch)) {
- if (is_attribute && IsNormalizableWhitespace(ch))
+ if (IsValidXmlCharacter(static_cast<unsigned char>(ch))) {
+ if (is_attribute &&
+ IsNormalizableWhitespace(static_cast<unsigned char>(ch)))
m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
<< ";";
else
@@ -4056,7 +4068,7 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
std::string output;
output.reserve(str.size());
for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
- if (IsValidXmlCharacter(*it))
+ if (IsValidXmlCharacter(static_cast<unsigned char>(*it)))
output.push_back(*it);
return output;
@@ -4064,7 +4076,6 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
// The following routines generate an XML representation of a UnitTest
// object.
-// GOOGLETEST_CM0009 DO NOT DELETE
//
// This is how Google Test concepts map to the DTD:
//
@@ -4113,12 +4124,12 @@ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
return "";
// YYYY-MM-DDThh:mm:ss.sss
return StreamableToString(time_struct.tm_year + 1900) + "-" +
- String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
- String::FormatIntWidth2(time_struct.tm_mday) + "T" +
- String::FormatIntWidth2(time_struct.tm_hour) + ":" +
- String::FormatIntWidth2(time_struct.tm_min) + ":" +
- String::FormatIntWidth2(time_struct.tm_sec) + "." +
- String::FormatIntWidthN(static_cast<int>(ms % 1000), 3);
+ String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+ String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+ String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+ String::FormatIntWidth2(time_struct.tm_min) + ":" +
+ String::FormatIntWidth2(time_struct.tm_sec) + "." +
+ String::FormatIntWidthN(static_cast<int>(ms % 1000), 3);
}
// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
@@ -4129,8 +4140,8 @@ void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
for (;;) {
const char* const next_segment = strstr(segment, "]]>");
if (next_segment != nullptr) {
- stream->write(
- segment, static_cast<std::streamsize>(next_segment - segment));
+ stream->write(segment,
+ static_cast<std::streamsize>(next_segment - segment));
*stream << "]]>]]&gt;<![CDATA[";
segment = next_segment + strlen("]]>");
} else {
@@ -4142,15 +4153,13 @@ void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
}
void XmlUnitTestResultPrinter::OutputXmlAttribute(
- std::ostream* stream,
- const std::string& element_name,
- const std::string& name,
- const std::string& value) {
+ std::ostream* stream, const std::string& element_name,
+ const std::string& name, const std::string& value) {
const std::vector<std::string>& allowed_names =
GetReservedOutputAttributesForElement(element_name);
GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
- allowed_names.end())
+ allowed_names.end())
<< "Attribute " << name << " is not allowed for element <" << element_name
<< ">.";
@@ -4216,10 +4225,11 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
OutputXmlAttribute(stream, kTestsuite, "type_param",
test_info.type_param());
}
- if (GTEST_FLAG(list_tests)) {
- OutputXmlAttribute(stream, kTestsuite, "file", test_info.file());
- OutputXmlAttribute(stream, kTestsuite, "line",
- StreamableToString(test_info.line()));
+
+ OutputXmlAttribute(stream, kTestsuite, "file", test_info.file());
+ OutputXmlAttribute(stream, kTestsuite, "line",
+ StreamableToString(test_info.line()));
+ if (GTEST_FLAG_GET(list_tests)) {
*stream << " />\n";
return;
}
@@ -4254,8 +4264,7 @@ void XmlUnitTestResultPrinter::OutputXmlTestResult(::std::ostream* stream,
internal::FormatCompilerIndependentFileLocation(part.file_name(),
part.line_number());
const std::string summary = location + "\n" + part.summary();
- *stream << " <failure message=\""
- << EscapeXmlAttribute(summary)
+ *stream << " <failure message=\"" << EscapeXmlAttribute(summary)
<< "\" type=\"\">";
const std::string detail = location + "\n" + part.message();
OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
@@ -4295,7 +4304,7 @@ void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream,
OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name());
OutputXmlAttribute(stream, kTestsuite, "tests",
StreamableToString(test_suite.reportable_test_count()));
- if (!GTEST_FLAG(list_tests)) {
+ if (!GTEST_FLAG_GET(list_tests)) {
OutputXmlAttribute(stream, kTestsuite, "failures",
StreamableToString(test_suite.failed_test_count()));
OutputXmlAttribute(
@@ -4343,7 +4352,7 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
stream, kTestsuites, "timestamp",
FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
- if (GTEST_FLAG(shuffle)) {
+ if (GTEST_FLAG_GET(shuffle)) {
OutputXmlAttribute(stream, kTestsuites, "random_seed",
StreamableToString(unit_test.random_seed()));
}
@@ -4396,7 +4405,7 @@ std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
for (int i = 0; i < result.test_property_count(); ++i) {
const TestProperty& property = result.GetTestProperty(i);
attributes << " " << property.key() << "="
- << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+ << "\"" << EscapeXmlAttribute(property.value()) << "\"";
}
return attributes.GetString();
}
@@ -4410,15 +4419,15 @@ void XmlUnitTestResultPrinter::OutputXmlTestProperties(
return;
}
- *stream << "<" << kProperties << ">\n";
+ *stream << " <" << kProperties << ">\n";
for (int i = 0; i < result.test_property_count(); ++i) {
const TestProperty& property = result.GetTestProperty(i);
- *stream << "<" << kProperty;
+ *stream << " <" << kProperty;
*stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\"";
*stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\"";
*stream << "/>\n";
}
- *stream << "</" << kProperties << ">\n";
+ *stream << " </" << kProperties << ">\n";
}
// End XmlUnitTestResultPrinter
@@ -4442,16 +4451,12 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener {
//// streams the attribute as JSON.
static void OutputJsonKey(std::ostream* stream,
const std::string& element_name,
- const std::string& name,
- const std::string& value,
- const std::string& indent,
- bool comma = true);
+ const std::string& name, const std::string& value,
+ const std::string& indent, bool comma = true);
static void OutputJsonKey(std::ostream* stream,
const std::string& element_name,
- const std::string& name,
- int value,
- const std::string& indent,
- bool comma = true);
+ const std::string& name, int value,
+ const std::string& indent, bool comma = true);
// Streams a test suite JSON stanza containing the given test result.
//
@@ -4484,7 +4489,9 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener {
// The output file.
const std::string output_file_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(JsonUnitTestResultPrinter);
+ JsonUnitTestResultPrinter(const JsonUnitTestResultPrinter&) = delete;
+ JsonUnitTestResultPrinter& operator=(const JsonUnitTestResultPrinter&) =
+ delete;
};
// Creates a new JsonUnitTestResultPrinter.
@@ -4496,7 +4503,7 @@ JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file)
}
void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
- int /*iteration*/) {
+ int /*iteration*/) {
FILE* jsonout = OpenFileForWriting(output_file_);
std::stringstream stream;
PrintJsonUnitTest(&stream, unit_test);
@@ -4562,55 +4569,48 @@ static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) {
return "";
// YYYY-MM-DDThh:mm:ss
return StreamableToString(time_struct.tm_year + 1900) + "-" +
- String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
- String::FormatIntWidth2(time_struct.tm_mday) + "T" +
- String::FormatIntWidth2(time_struct.tm_hour) + ":" +
- String::FormatIntWidth2(time_struct.tm_min) + ":" +
- String::FormatIntWidth2(time_struct.tm_sec) + "Z";
+ String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+ String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+ String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+ String::FormatIntWidth2(time_struct.tm_min) + ":" +
+ String::FormatIntWidth2(time_struct.tm_sec) + "Z";
}
static inline std::string Indent(size_t width) {
return std::string(width, ' ');
}
-void JsonUnitTestResultPrinter::OutputJsonKey(
- std::ostream* stream,
- const std::string& element_name,
- const std::string& name,
- const std::string& value,
- const std::string& indent,
- bool comma) {
+void JsonUnitTestResultPrinter::OutputJsonKey(std::ostream* stream,
+ const std::string& element_name,
+ const std::string& name,
+ const std::string& value,
+ const std::string& indent,
+ bool comma) {
const std::vector<std::string>& allowed_names =
GetReservedOutputAttributesForElement(element_name);
GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
- allowed_names.end())
+ allowed_names.end())
<< "Key \"" << name << "\" is not allowed for value \"" << element_name
<< "\".";
*stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\"";
- if (comma)
- *stream << ",\n";
+ if (comma) *stream << ",\n";
}
void JsonUnitTestResultPrinter::OutputJsonKey(
- std::ostream* stream,
- const std::string& element_name,
- const std::string& name,
- int value,
- const std::string& indent,
- bool comma) {
+ std::ostream* stream, const std::string& element_name,
+ const std::string& name, int value, const std::string& indent, bool comma) {
const std::vector<std::string>& allowed_names =
GetReservedOutputAttributesForElement(element_name);
GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
- allowed_names.end())
+ allowed_names.end())
<< "Key \"" << name << "\" is not allowed for value \"" << element_name
<< "\".";
*stream << indent << "\"" << name << "\": " << StreamableToString(value);
- if (comma)
- *stream << ",\n";
+ if (comma) *stream << ",\n";
}
// Streams a test suite JSON stanza containing the given test result.
@@ -4620,7 +4620,7 @@ void JsonUnitTestResultPrinter::OutputJsonTestSuiteForTestResult(
*stream << Indent(4) << "{\n";
OutputJsonKey(stream, "testsuite", "name", "NonTestSuiteFailure", Indent(6));
OutputJsonKey(stream, "testsuite", "tests", 1, Indent(6));
- if (!GTEST_FLAG(list_tests)) {
+ if (!GTEST_FLAG_GET(list_tests)) {
OutputJsonKey(stream, "testsuite", "failures", 1, Indent(6));
OutputJsonKey(stream, "testsuite", "disabled", 0, Indent(6));
OutputJsonKey(stream, "testsuite", "skipped", 0, Indent(6));
@@ -4674,11 +4674,14 @@ void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream,
OutputJsonKey(stream, kTestsuite, "type_param", test_info.type_param(),
kIndent);
}
- if (GTEST_FLAG(list_tests)) {
- OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent);
- OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false);
+
+ OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent);
+ OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false);
+ if (GTEST_FLAG_GET(list_tests)) {
*stream << "\n" << Indent(8) << "}";
return;
+ } else {
+ *stream << ",\n";
}
OutputJsonKey(stream, kTestsuite, "status",
@@ -4710,7 +4713,9 @@ void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream,
if (part.failed()) {
*stream << ",\n";
if (++failures == 1) {
- *stream << kIndent << "\"" << "failures" << "\": [\n";
+ *stream << kIndent << "\""
+ << "failures"
+ << "\": [\n";
}
const std::string location =
internal::FormatCompilerIndependentFileLocation(part.file_name(),
@@ -4723,8 +4728,7 @@ void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream,
}
}
- if (failures > 0)
- *stream << "\n" << kIndent << "]";
+ if (failures > 0) *stream << "\n" << kIndent << "]";
*stream << "\n" << Indent(8) << "}";
}
@@ -4738,7 +4742,7 @@ void JsonUnitTestResultPrinter::PrintJsonTestSuite(
OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), kIndent);
OutputJsonKey(stream, kTestsuite, "tests", test_suite.reportable_test_count(),
kIndent);
- if (!GTEST_FLAG(list_tests)) {
+ if (!GTEST_FLAG_GET(list_tests)) {
OutputJsonKey(stream, kTestsuite, "failures",
test_suite.failed_test_count(), kIndent);
OutputJsonKey(stream, kTestsuite, "disabled",
@@ -4785,7 +4789,7 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream,
OutputJsonKey(stream, kTestsuites, "disabled",
unit_test.reportable_disabled_test_count(), kIndent);
OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent);
- if (GTEST_FLAG(shuffle)) {
+ if (GTEST_FLAG_GET(shuffle)) {
OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(),
kIndent);
}
@@ -4820,7 +4824,9 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream,
OutputJsonTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result());
}
- *stream << "\n" << kIndent << "]\n" << "}\n";
+ *stream << "\n"
+ << kIndent << "]\n"
+ << "}\n";
}
void JsonUnitTestResultPrinter::PrintJsonTestList(
@@ -4855,7 +4861,8 @@ std::string JsonUnitTestResultPrinter::TestPropertiesAsJson(
Message attributes;
for (int i = 0; i < result.test_property_count(); ++i) {
const TestProperty& property = result.GetTestProperty(i);
- attributes << ",\n" << indent << "\"" << property.key() << "\": "
+ attributes << ",\n"
+ << indent << "\"" << property.key() << "\": "
<< "\"" << EscapeJson(property.value()) << "\"";
}
return attributes.GetString();
@@ -4895,14 +4902,14 @@ void StreamingListener::SocketWriter::MakeConnection() {
addrinfo hints;
memset(&hints, 0, sizeof(hints));
- hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses.
+ hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses.
hints.ai_socktype = SOCK_STREAM;
addrinfo* servinfo = nullptr;
// Use the getaddrinfo() to get a linked list of IP addresses for
// the given host name.
- const int error_num = getaddrinfo(
- host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+ const int error_num =
+ getaddrinfo(host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
if (error_num != 0) {
GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
<< gai_strerror(error_num);
@@ -4911,8 +4918,8 @@ void StreamingListener::SocketWriter::MakeConnection() {
// Loop through all the results and connect to the first we can.
for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr;
cur_addr = cur_addr->ai_next) {
- sockfd_ = socket(
- cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
+ sockfd_ = socket(cur_addr->ai_family, cur_addr->ai_socktype,
+ cur_addr->ai_protocol);
if (sockfd_ != -1) {
// Connect the client socket to the server socket.
if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
@@ -4962,7 +4969,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
for (int i = 0; i < raw_stack_size; ++i) {
if (raw_stack[i] == caller_frame &&
- !GTEST_FLAG(show_internal_stack_frames)) {
+ !GTEST_FLAG_GET(show_internal_stack_frames)) {
// Add a marker to the trace and stop adding frames.
absl::StrAppend(&result, kElidedFramesMarker, "\n");
break;
@@ -4981,7 +4988,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
return result;
-#else // !GTEST_HAS_ABSL
+#else // !GTEST_HAS_ABSL
static_cast<void>(max_depth);
static_cast<void>(skip_count);
return "";
@@ -5005,14 +5012,14 @@ void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) {
class ScopedPrematureExitFile {
public:
explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
- : premature_exit_filepath_(premature_exit_filepath ?
- premature_exit_filepath : "") {
+ : premature_exit_filepath_(
+ premature_exit_filepath ? premature_exit_filepath : "") {
// If a path to the premature-exit file is specified...
if (!premature_exit_filepath_.empty()) {
// create the file with a single "0" character in it. I/O
// errors are ignored as there's nothing better we can do and we
// don't want to fail the test because of this.
- FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
+ FILE* pfile = posix::FOpen(premature_exit_filepath_.c_str(), "w");
fwrite("0", 1, 1, pfile);
fclose(pfile);
}
@@ -5034,7 +5041,8 @@ class ScopedPrematureExitFile {
private:
const std::string premature_exit_filepath_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile);
+ ScopedPrematureExitFile(const ScopedPrematureExitFile&) = delete;
+ ScopedPrematureExitFile& operator=(const ScopedPrematureExitFile&) = delete;
};
} // namespace internal
@@ -5208,7 +5216,7 @@ int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
// Gets the time of the test program start, in ms from the start of the
// UNIX epoch.
internal::TimeInMillis UnitTest::start_timestamp() const {
- return impl()->start_timestamp();
+ return impl()->start_timestamp();
}
// Gets the elapsed time, in milliseconds.
@@ -5251,9 +5259,7 @@ TestSuite* UnitTest::GetMutableTestSuite(int i) {
// Returns the list of event listeners that can be used to track events
// inside Google Test.
-TestEventListeners& UnitTest::listeners() {
- return *impl()->listeners();
-}
+TestEventListeners& UnitTest::listeners() { return *impl()->listeners(); }
// Registers and returns a global test environment. When a test
// program is run, all global test environments will be set-up in the
@@ -5278,12 +5284,11 @@ Environment* UnitTest::AddEnvironment(Environment* env) {
// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
// this to report their results. The user code should use the
// assertion macros instead of calling this directly.
-void UnitTest::AddTestPartResult(
- TestPartResult::Type result_type,
- const char* file_name,
- int line_number,
- const std::string& message,
- const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
+void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
+ const char* file_name, int line_number,
+ const std::string& message,
+ const std::string& os_stack_trace)
+ GTEST_LOCK_EXCLUDED_(mutex_) {
Message msg;
msg << message;
@@ -5293,8 +5298,9 @@ void UnitTest::AddTestPartResult(
for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) {
const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
- msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
- << " " << trace.message;
+ msg << "\n"
+ << internal::FormatFileLocation(trace.file, trace.line) << " "
+ << trace.message;
}
}
@@ -5304,8 +5310,8 @@ void UnitTest::AddTestPartResult(
const TestPartResult result = TestPartResult(
result_type, file_name, line_number, msg.GetString().c_str());
- impl_->GetTestPartResultReporterForCurrentThread()->
- ReportTestPartResult(result);
+ impl_->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
+ result);
if (result_type != TestPartResult::kSuccess &&
result_type != TestPartResult::kSkip) {
@@ -5314,7 +5320,7 @@ void UnitTest::AddTestPartResult(
// in the code (perhaps in order to use Google Test assertions
// with another testing framework) and specify the former on the
// command line for debugging.
- if (GTEST_FLAG(break_on_failure)) {
+ if (GTEST_FLAG_GET(break_on_failure)) {
#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
// Using DebugBreak on Windows allows gtest to still break into a debugger
// when a failure happens and both the --gtest_break_on_failure and
@@ -5331,7 +5337,7 @@ void UnitTest::AddTestPartResult(
// portability: some debuggers don't correctly trap abort().
*static_cast<volatile int*>(nullptr) = 1;
#endif // GTEST_OS_WINDOWS
- } else if (GTEST_FLAG(throw_on_failure)) {
+ } else if (GTEST_FLAG_GET(throw_on_failure)) {
#if GTEST_HAS_EXCEPTIONS
throw internal::GoogleTestFailureException(result);
#else
@@ -5360,7 +5366,7 @@ void UnitTest::RecordProperty(const std::string& key,
// from the main thread.
int UnitTest::Run() {
const bool in_death_test_child_process =
- internal::GTEST_FLAG(internal_run_death_test).length() > 0;
+ GTEST_FLAG_GET(internal_run_death_test).length() > 0;
// Google Test implements this protocol for catching that a test
// program exits before returning control to Google Test:
@@ -5390,7 +5396,7 @@ int UnitTest::Run() {
// Captures the value of GTEST_FLAG(catch_exceptions). This value will be
// used for the duration of the program.
- impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
+ impl()->set_catch_exceptions(GTEST_FLAG_GET(catch_exceptions));
#if GTEST_OS_WINDOWS
// Either the user wants Google Test to catch exceptions thrown by the
@@ -5398,26 +5404,26 @@ int UnitTest::Run() {
// process. In either case the user does not want to see pop-up dialogs
// about crashes - they are expected.
if (impl()->catch_exceptions() || in_death_test_child_process) {
-# if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
// SetErrorMode doesn't exist on CE.
SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
-# endif // !GTEST_OS_WINDOWS_MOBILE
+#endif // !GTEST_OS_WINDOWS_MOBILE
-# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+#if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
// Death test children can be terminated with _abort(). On Windows,
// _abort() can show a dialog with a warning message. This forces the
// abort message to go to stderr instead.
_set_error_mode(_OUT_TO_STDERR);
-# endif
+#endif
-# if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
+#if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
// In the debug version, Visual Studio pops up a separate dialog
// offering a choice to debug the aborted program. We need to suppress
// this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
// executed. Google Test will notify the user of any unexpected
// failure via stderr.
- if (!GTEST_FLAG(break_on_failure))
+ if (!GTEST_FLAG_GET(break_on_failure))
_set_abort_behavior(
0x0, // Clear the following flags:
_WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump.
@@ -5431,14 +5437,15 @@ int UnitTest::Run() {
_CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG);
(void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR);
}
-# endif
+#endif
}
#endif // GTEST_OS_WINDOWS
return internal::HandleExceptionsInMethodIfSupported(
- impl(),
- &internal::UnitTestImpl::RunAllTests,
- "auxiliary test code (environments or event listeners)") ? 0 : 1;
+ impl(), &internal::UnitTestImpl::RunAllTests,
+ "auxiliary test code (environments or event listeners)")
+ ? 0
+ : 1;
}
// Returns the working directory when the first TEST() or TEST_F() was
@@ -5483,14 +5490,10 @@ UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) {
}
// Creates an empty UnitTest.
-UnitTest::UnitTest() {
- impl_ = new internal::UnitTestImpl(this);
-}
+UnitTest::UnitTest() { impl_ = new internal::UnitTestImpl(this); }
// Destructor of UnitTest.
-UnitTest::~UnitTest() {
- delete impl_;
-}
+UnitTest::~UnitTest() { delete impl_; }
// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
// Google Test trace stack.
@@ -5501,8 +5504,7 @@ void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
}
// Pops a trace from the per-thread Google Test trace stack.
-void UnitTest::PopGTestTrace()
- GTEST_LOCK_EXCLUDED_(mutex_) {
+void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) {
internal::MutexLock lock(&mutex_);
impl_->gtest_trace_stack().pop_back();
}
@@ -5599,12 +5601,12 @@ void UnitTestImpl::ConfigureXmlOutput() {
// Initializes event listeners for streaming test results in string form.
// Must not be called before InitGoogleTest.
void UnitTestImpl::ConfigureStreamingOutput() {
- const std::string& target = GTEST_FLAG(stream_result_to);
+ const std::string& target = GTEST_FLAG_GET(stream_result_to);
if (!target.empty()) {
const size_t pos = target.find(':');
if (pos != std::string::npos) {
- listeners()->Append(new StreamingListener(target.substr(0, pos),
- target.substr(pos+1)));
+ listeners()->Append(
+ new StreamingListener(target.substr(0, pos), target.substr(pos + 1)));
} else {
GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target
<< "\" ignored.";
@@ -5642,7 +5644,7 @@ void UnitTestImpl::PostFlagParsingInit() {
// to shut down the default XML output before invoking RUN_ALL_TESTS.
ConfigureXmlOutput();
- if (GTEST_FLAG(brief)) {
+ if (GTEST_FLAG_GET(brief)) {
listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter);
}
@@ -5652,7 +5654,7 @@ void UnitTestImpl::PostFlagParsingInit() {
#endif // GTEST_CAN_STREAM_RESULTS_
#if GTEST_HAS_ABSL
- if (GTEST_FLAG(install_failure_signal_handler)) {
+ if (GTEST_FLAG_GET(install_failure_signal_handler)) {
absl::FailureSignalHandlerOptions options;
absl::InstallFailureSignalHandler(options);
}
@@ -5710,9 +5712,9 @@ TestSuite* UnitTestImpl::GetTestSuite(
auto* const new_test_suite =
new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc);
+ const UnitTestFilter death_test_suite_filter(kDeathTestSuiteFilter);
// Is this a death test suite?
- if (internal::UnitTestOptions::MatchesFilter(test_suite_name,
- kDeathTestSuiteFilter)) {
+ if (death_test_suite_filter.MatchesName(test_suite_name)) {
// Yes. Inserts the test suite after the last death test suite
// defined so far. This only works when the test suites haven't
// been shuffled. Otherwise we may end up running a death test
@@ -5749,8 +5751,7 @@ bool UnitTestImpl::RunAllTests() {
const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized();
// Do not run any test if the --help flag was specified.
- if (g_help_flag)
- return true;
+ if (g_help_flag) return true;
// Repeats the call to the post-flag parsing initialization in case the
// user didn't call InitGoogleTest.
@@ -5768,11 +5769,11 @@ bool UnitTestImpl::RunAllTests() {
#if GTEST_HAS_DEATH_TEST
in_subprocess_for_death_test =
(internal_run_death_test_flag_.get() != nullptr);
-# if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
if (in_subprocess_for_death_test) {
GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_();
}
-# endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+#endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
#endif // GTEST_HAS_DEATH_TEST
const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
@@ -5780,19 +5781,18 @@ bool UnitTestImpl::RunAllTests() {
// Compares the full test names with the filter to decide which
// tests to run.
- const bool has_tests_to_run = FilterTests(should_shard
- ? HONOR_SHARDING_PROTOCOL
- : IGNORE_SHARDING_PROTOCOL) > 0;
+ const bool has_tests_to_run =
+ FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL
+ : IGNORE_SHARDING_PROTOCOL) > 0;
// Lists the tests and exits if the --gtest_list_tests flag was specified.
- if (GTEST_FLAG(list_tests)) {
+ if (GTEST_FLAG_GET(list_tests)) {
// This must be called *after* FilterTests() has been called.
ListTestsMatchingFilter();
return true;
}
- random_seed_ = GTEST_FLAG(shuffle) ?
- GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
+ random_seed_ = GetRandomSeedFromFlag(GTEST_FLAG_GET(random_seed));
// True if and only if at least one test has failed.
bool failed = false;
@@ -5804,9 +5804,21 @@ bool UnitTestImpl::RunAllTests() {
// How many times to repeat the tests? We don't want to repeat them
// when we are inside the subprocess of a death test.
- const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
+ const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG_GET(repeat);
+
// Repeats forever if the repeat count is negative.
const bool gtest_repeat_forever = repeat < 0;
+
+ // Should test environments be set up and torn down for each repeat, or only
+ // set up on the first and torn down on the last iteration? If there is no
+ // "last" iteration because the tests will repeat forever, always recreate the
+ // environments to avoid leaks in case one of the environments is using
+ // resources that are external to this process. Without this check there would
+ // be no way to clean up those external resources automatically.
+ const bool recreate_environments_when_repeating =
+ GTEST_FLAG_GET(recreate_environments_when_repeating) ||
+ gtest_repeat_forever;
+
for (int i = 0; gtest_repeat_forever || i != repeat; i++) {
// We want to preserve failures generated by ad-hoc test
// assertions executed before RUN_ALL_TESTS().
@@ -5815,7 +5827,7 @@ bool UnitTestImpl::RunAllTests() {
Timer timer;
// Shuffles test suites and tests if requested.
- if (has_tests_to_run && GTEST_FLAG(shuffle)) {
+ if (has_tests_to_run && GTEST_FLAG_GET(shuffle)) {
random()->Reseed(static_cast<uint32_t>(random_seed_));
// This should be done before calling OnTestIterationStart(),
// such that a test event listener can see the actual test order
@@ -5828,10 +5840,13 @@ bool UnitTestImpl::RunAllTests() {
// Runs each test suite if there is at least one test to run.
if (has_tests_to_run) {
- // Sets up all environments beforehand.
- repeater->OnEnvironmentsSetUpStart(*parent_);
- ForEach(environments_, SetUpEnvironment);
- repeater->OnEnvironmentsSetUpEnd(*parent_);
+ // Sets up all environments beforehand. If test environments aren't
+ // recreated for each iteration, only do so on the first iteration.
+ if (i == 0 || recreate_environments_when_repeating) {
+ repeater->OnEnvironmentsSetUpStart(*parent_);
+ ForEach(environments_, SetUpEnvironment);
+ repeater->OnEnvironmentsSetUpEnd(*parent_);
+ }
// Runs the tests only if there was no fatal failure or skip triggered
// during global set-up.
@@ -5853,7 +5868,7 @@ bool UnitTestImpl::RunAllTests() {
for (int test_index = 0; test_index < total_test_suite_count();
test_index++) {
GetMutableSuiteCase(test_index)->Run();
- if (GTEST_FLAG(fail_fast) &&
+ if (GTEST_FLAG_GET(fail_fast) &&
GetMutableSuiteCase(test_index)->Failed()) {
for (int j = test_index + 1; j < total_test_suite_count(); j++) {
GetMutableSuiteCase(j)->Skip();
@@ -5871,11 +5886,15 @@ bool UnitTestImpl::RunAllTests() {
}
}
- // Tears down all environments in reverse order afterwards.
- repeater->OnEnvironmentsTearDownStart(*parent_);
- std::for_each(environments_.rbegin(), environments_.rend(),
- TearDownEnvironment);
- repeater->OnEnvironmentsTearDownEnd(*parent_);
+ // Tears down all environments in reverse order afterwards. If test
+ // environments aren't recreated for each iteration, only do so on the
+ // last iteration.
+ if (i == repeat - 1 || recreate_environments_when_repeating) {
+ repeater->OnEnvironmentsTearDownStart(*parent_);
+ std::for_each(environments_.rbegin(), environments_.rend(),
+ TearDownEnvironment);
+ repeater->OnEnvironmentsTearDownEnd(*parent_);
+ }
}
elapsed_time_ = timer.Elapsed();
@@ -5896,7 +5915,7 @@ bool UnitTestImpl::RunAllTests() {
// (it's always safe to unshuffle the tests).
UnshuffleTests();
- if (GTEST_FLAG(shuffle)) {
+ if (GTEST_FLAG_GET(shuffle)) {
// Picks a new random seed for each iteration.
random_seed_ = GetNextRandomSeed(random_seed_);
}
@@ -5947,8 +5966,7 @@ void WriteToShardStatusFileIfNeeded() {
// an error and exits. If in_subprocess_for_death_test, sharding is
// disabled because it must only be applied to the original test
// process. Otherwise, we could filter out death tests we intended to execute.
-bool ShouldShard(const char* total_shards_env,
- const char* shard_index_env,
+bool ShouldShard(const char* total_shards_env, const char* shard_index_env,
bool in_subprocess_for_death_test) {
if (in_subprocess_for_death_test) {
return false;
@@ -5960,27 +5978,27 @@ bool ShouldShard(const char* total_shards_env,
if (total_shards == -1 && shard_index == -1) {
return false;
} else if (total_shards == -1 && shard_index != -1) {
- const Message msg = Message()
- << "Invalid environment variables: you have "
- << kTestShardIndex << " = " << shard_index
- << ", but have left " << kTestTotalShards << " unset.\n";
+ const Message msg = Message() << "Invalid environment variables: you have "
+ << kTestShardIndex << " = " << shard_index
+ << ", but have left " << kTestTotalShards
+ << " unset.\n";
ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
fflush(stdout);
exit(EXIT_FAILURE);
} else if (total_shards != -1 && shard_index == -1) {
const Message msg = Message()
- << "Invalid environment variables: you have "
- << kTestTotalShards << " = " << total_shards
- << ", but have left " << kTestShardIndex << " unset.\n";
+ << "Invalid environment variables: you have "
+ << kTestTotalShards << " = " << total_shards
+ << ", but have left " << kTestShardIndex << " unset.\n";
ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
fflush(stdout);
exit(EXIT_FAILURE);
} else if (shard_index < 0 || shard_index >= total_shards) {
- const Message msg = Message()
- << "Invalid environment variables: we require 0 <= "
- << kTestShardIndex << " < " << kTestTotalShards
- << ", but you have " << kTestShardIndex << "=" << shard_index
- << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+ const Message msg =
+ Message() << "Invalid environment variables: we require 0 <= "
+ << kTestShardIndex << " < " << kTestTotalShards
+ << ", but you have " << kTestShardIndex << "=" << shard_index
+ << ", " << kTestTotalShards << "=" << total_shards << ".\n";
ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
fflush(stdout);
exit(EXIT_FAILURE);
@@ -6022,11 +6040,16 @@ bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md
// . Returns the number of tests that should run.
int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
- const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
- Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
- const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
- Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
-
+ const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL
+ ? Int32FromEnvOrDie(kTestTotalShards, -1)
+ : -1;
+ const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL
+ ? Int32FromEnvOrDie(kTestShardIndex, -1)
+ : -1;
+
+ const PositiveAndNegativeUnitTestFilter gtest_flag_filter(
+ GTEST_FLAG_GET(filter));
+ const UnitTestFilter disable_test_filter(kDisableTestFilter);
// num_runnable_tests are the number of tests that will
// run across all shards (i.e., match filter and are not disabled).
// num_selected_tests are the number of tests to be run on
@@ -6042,18 +6065,17 @@ int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
const std::string test_name(test_info->name());
// A test is disabled if test suite name or test name matches
// kDisableTestFilter.
- const bool is_disabled = internal::UnitTestOptions::MatchesFilter(
- test_suite_name, kDisableTestFilter) ||
- internal::UnitTestOptions::MatchesFilter(
- test_name, kDisableTestFilter);
+ const bool is_disabled =
+ disable_test_filter.MatchesName(test_suite_name) ||
+ disable_test_filter.MatchesName(test_name);
test_info->is_disabled_ = is_disabled;
- const bool matches_filter = internal::UnitTestOptions::FilterMatchesTest(
- test_suite_name, test_name);
+ const bool matches_filter =
+ gtest_flag_filter.MatchesTest(test_suite_name, test_name);
test_info->matches_filter_ = matches_filter;
const bool is_runnable =
- (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
+ (GTEST_FLAG_GET(also_run_disabled_tests) || !is_disabled) &&
matches_filter;
const bool is_in_another_shard =
@@ -6222,8 +6244,8 @@ void UnitTestImpl::UnshuffleTests() {
// For example, if Foo() calls Bar(), which in turn calls
// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
- int skip_count) {
+GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_ std::string
+GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, int skip_count) {
// We pass skip_count + 1 to skip this wrapper function in addition
// to what the user really wants to skip.
return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
@@ -6233,7 +6255,7 @@ std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
// suppress unreachable code warnings.
namespace {
class ClassUniqueToAlwaysTrue {};
-}
+} // namespace
bool IsTrue(bool condition) { return condition; }
@@ -6241,8 +6263,7 @@ bool AlwaysTrue() {
#if GTEST_HAS_EXCEPTIONS
// This condition is always false so AlwaysTrue() never actually throws,
// but it makes the compiler think that it may throw.
- if (IsTrue(false))
- throw ClassUniqueToAlwaysTrue();
+ if (IsTrue(false)) throw ClassUniqueToAlwaysTrue();
#endif // GTEST_HAS_EXCEPTIONS
return true;
}
@@ -6264,13 +6285,14 @@ bool SkipPrefix(const char* prefix, const char** pstr) {
// part can be omitted.
//
// Returns the value of the flag, or NULL if the parsing failed.
-static const char* ParseFlagValue(const char* str, const char* flag,
+static const char* ParseFlagValue(const char* str, const char* flag_name,
bool def_optional) {
// str and flag must not be NULL.
- if (str == nullptr || flag == nullptr) return nullptr;
+ if (str == nullptr || flag_name == nullptr) return nullptr;
// The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
- const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag;
+ const std::string flag_str =
+ std::string("--") + GTEST_FLAG_PREFIX_ + flag_name;
const size_t flag_len = flag_str.length();
if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
@@ -6301,9 +6323,9 @@ static const char* ParseFlagValue(const char* str, const char* flag,
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
-static bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+static bool ParseFlag(const char* str, const char* flag_name, bool* value) {
// Gets the value of the flag as a string.
- const char* const value_str = ParseFlagValue(str, flag, true);
+ const char* const value_str = ParseFlagValue(str, flag_name, true);
// Aborts if the parsing failed.
if (value_str == nullptr) return false;
@@ -6317,16 +6339,16 @@ static bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
-bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
+bool ParseFlag(const char* str, const char* flag_name, int32_t* value) {
// Gets the value of the flag as a string.
- const char* const value_str = ParseFlagValue(str, flag, false);
+ const char* const value_str = ParseFlagValue(str, flag_name, false);
// Aborts if the parsing failed.
if (value_str == nullptr) return false;
// Sets *value to the value of the flag.
- return ParseInt32(Message() << "The value of flag --" << flag,
- value_str, value);
+ return ParseInt32(Message() << "The value of flag --" << flag_name, value_str,
+ value);
}
// Parses a string for a string flag, in the form of "--flag=value".
@@ -6334,9 +6356,9 @@ bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
template <typename String>
-static bool ParseStringFlag(const char* str, const char* flag, String* value) {
+static bool ParseFlag(const char* str, const char* flag_name, String* value) {
// Gets the value of the flag as a string.
- const char* const value_str = ParseFlagValue(str, flag, false);
+ const char* const value_str = ParseFlagValue(str, flag_name, false);
// Aborts if the parsing failed.
if (value_str == nullptr) return false;
@@ -6353,8 +6375,7 @@ static bool ParseStringFlag(const char* str, const char* flag, String* value) {
// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
// internal flags and do not trigger the help message.
static bool HasGoogleTestFlagPrefix(const char* str) {
- return (SkipPrefix("--", &str) ||
- SkipPrefix("-", &str) ||
+ return (SkipPrefix("--", &str) || SkipPrefix("-", &str) ||
SkipPrefix("/", &str)) &&
!SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
(SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
@@ -6437,6 +6458,10 @@ static const char kColorEncodedHelpMessage[] =
"random_seed=@Y[NUMBER]@D\n"
" Random number seed to use for shuffling test orders (between 1 and\n"
" 99999, or 0 to use a seed based on the current time).\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "recreate_environments_when_repeating@D\n"
+ " Sets up and tears down the global test environment on each repeat\n"
+ " of the test.\n"
"\n"
"Test Output:\n"
" @G--" GTEST_FLAG_PREFIX_
@@ -6454,18 +6479,18 @@ static const char kColorEncodedHelpMessage[] =
" Generate a JSON or XML report in the given directory or with the "
"given\n"
" file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n"
-# if GTEST_CAN_STREAM_RESULTS_
+#if GTEST_CAN_STREAM_RESULTS_
" @G--" GTEST_FLAG_PREFIX_
"stream_result_to=@YHOST@G:@YPORT@D\n"
" Stream test results to the given server.\n"
-# endif // GTEST_CAN_STREAM_RESULTS_
+#endif // GTEST_CAN_STREAM_RESULTS_
"\n"
"Assertion Behavior:\n"
-# if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
" @G--" GTEST_FLAG_PREFIX_
"death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
" Set the default death test style.\n"
-# endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
" @G--" GTEST_FLAG_PREFIX_
"break_on_failure@D\n"
" Turn assertion failures into debugger break-points.\n"
@@ -6497,41 +6522,44 @@ static const char kColorEncodedHelpMessage[] =
"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
static bool ParseGoogleTestFlag(const char* const arg) {
- return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
- &GTEST_FLAG(also_run_disabled_tests)) ||
- ParseBoolFlag(arg, kBreakOnFailureFlag,
- &GTEST_FLAG(break_on_failure)) ||
- ParseBoolFlag(arg, kCatchExceptionsFlag,
- &GTEST_FLAG(catch_exceptions)) ||
- ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
- ParseStringFlag(arg, kDeathTestStyleFlag,
- &GTEST_FLAG(death_test_style)) ||
- ParseBoolFlag(arg, kDeathTestUseFork,
- &GTEST_FLAG(death_test_use_fork)) ||
- ParseBoolFlag(arg, kFailFast, &GTEST_FLAG(fail_fast)) ||
- ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
- ParseStringFlag(arg, kInternalRunDeathTestFlag,
- &GTEST_FLAG(internal_run_death_test)) ||
- ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
- ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
- ParseBoolFlag(arg, kBriefFlag, &GTEST_FLAG(brief)) ||
- ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
- ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) ||
- ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
- ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
- ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
- ParseInt32Flag(arg, kStackTraceDepthFlag,
- &GTEST_FLAG(stack_trace_depth)) ||
- ParseStringFlag(arg, kStreamResultToFlag,
- &GTEST_FLAG(stream_result_to)) ||
- ParseBoolFlag(arg, kThrowOnFailureFlag, &GTEST_FLAG(throw_on_failure));
+#define GTEST_INTERNAL_PARSE_FLAG(flag_name) \
+ do { \
+ auto value = GTEST_FLAG_GET(flag_name); \
+ if (ParseFlag(arg, #flag_name, &value)) { \
+ GTEST_FLAG_SET(flag_name, value); \
+ return true; \
+ } \
+ } while (false)
+
+ GTEST_INTERNAL_PARSE_FLAG(also_run_disabled_tests);
+ GTEST_INTERNAL_PARSE_FLAG(break_on_failure);
+ GTEST_INTERNAL_PARSE_FLAG(catch_exceptions);
+ GTEST_INTERNAL_PARSE_FLAG(color);
+ GTEST_INTERNAL_PARSE_FLAG(death_test_style);
+ GTEST_INTERNAL_PARSE_FLAG(death_test_use_fork);
+ GTEST_INTERNAL_PARSE_FLAG(fail_fast);
+ GTEST_INTERNAL_PARSE_FLAG(filter);
+ GTEST_INTERNAL_PARSE_FLAG(internal_run_death_test);
+ GTEST_INTERNAL_PARSE_FLAG(list_tests);
+ GTEST_INTERNAL_PARSE_FLAG(output);
+ GTEST_INTERNAL_PARSE_FLAG(brief);
+ GTEST_INTERNAL_PARSE_FLAG(print_time);
+ GTEST_INTERNAL_PARSE_FLAG(print_utf8);
+ GTEST_INTERNAL_PARSE_FLAG(random_seed);
+ GTEST_INTERNAL_PARSE_FLAG(repeat);
+ GTEST_INTERNAL_PARSE_FLAG(recreate_environments_when_repeating);
+ GTEST_INTERNAL_PARSE_FLAG(shuffle);
+ GTEST_INTERNAL_PARSE_FLAG(stack_trace_depth);
+ GTEST_INTERNAL_PARSE_FLAG(stream_result_to);
+ GTEST_INTERNAL_PARSE_FLAG(throw_on_failure);
+ return false;
}
#if GTEST_USE_OWN_FLAGFILE_FLAG_
static void LoadFlagsFromFile(const std::string& path) {
FILE* flagfile = posix::FOpen(path.c_str(), "r");
if (!flagfile) {
- GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG(flagfile)
+ GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG_GET(flagfile)
<< "\"";
}
std::string contents(ReadEntireFile(flagfile));
@@ -6539,10 +6567,8 @@ static void LoadFlagsFromFile(const std::string& path) {
std::vector<std::string> lines;
SplitString(contents, '\n', &lines);
for (size_t i = 0; i < lines.size(); ++i) {
- if (lines[i].empty())
- continue;
- if (!ParseGoogleTestFlag(lines[i].c_str()))
- g_help_flag = true;
+ if (lines[i].empty()) continue;
+ if (!ParseGoogleTestFlag(lines[i].c_str())) g_help_flag = true;
}
}
#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
@@ -6552,25 +6578,23 @@ static void LoadFlagsFromFile(const std::string& path) {
// instantiated to either char or wchar_t.
template <typename CharType>
void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
+ std::string flagfile_value;
for (int i = 1; i < *argc; i++) {
const std::string arg_string = StreamableToString(argv[i]);
const char* const arg = arg_string.c_str();
- using internal::ParseBoolFlag;
- using internal::ParseInt32Flag;
- using internal::ParseStringFlag;
+ using internal::ParseFlag;
bool remove_flag = false;
if (ParseGoogleTestFlag(arg)) {
remove_flag = true;
#if GTEST_USE_OWN_FLAGFILE_FLAG_
- } else if (ParseStringFlag(arg, kFlagfileFlag, &GTEST_FLAG(flagfile))) {
- LoadFlagsFromFile(GTEST_FLAG(flagfile));
+ } else if (ParseFlag(arg, "flagfile", &flagfile_value)) {
+ GTEST_FLAG_SET(flagfile, flagfile_value);
+ LoadFlagsFromFile(flagfile_value);
remove_flag = true;
#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
- } else if (arg_string == "--help" || arg_string == "-h" ||
- arg_string == "-?" || arg_string == "/?" ||
- HasGoogleTestFlagPrefix(arg)) {
+ } else if (arg_string == "--help" || HasGoogleTestFlagPrefix(arg)) {
// Both help flag and unrecognized Google Test flags (excluding
// internal ones) trigger help display.
g_help_flag = true;
@@ -6605,7 +6629,27 @@ void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
// Parses the command line for Google Test flags, without initializing
// other parts of Google Test.
void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
+#if GTEST_HAS_ABSL
+ if (*argc > 0) {
+ // absl::ParseCommandLine() requires *argc > 0.
+ auto positional_args = absl::flags_internal::ParseCommandLineImpl(
+ *argc, argv, absl::flags_internal::ArgvListAction::kRemoveParsedArgs,
+ absl::flags_internal::UsageFlagsAction::kHandleUsage,
+ absl::flags_internal::OnUndefinedFlag::kReportUndefined);
+ // Any command-line positional arguments not part of any command-line flag
+ // (or arguments to a flag) are copied back out to argv, with the program
+ // invocation name at position 0, and argc is resized. This includes
+ // positional arguments after the flag-terminating delimiter '--'.
+ // See https://abseil.io/docs/cpp/guides/flags.
+ std::copy(positional_args.begin(), positional_args.end(), argv);
+ if (static_cast<int>(positional_args.size()) < *argc) {
+ argv[positional_args.size()] = nullptr;
+ *argc = static_cast<int>(positional_args.size());
+ }
+ }
+#else
ParseGoogleTestFlagsOnlyImpl(argc, argv);
+#endif
// Fix the value of *_NSGetArgc() on macOS, but if and only if
// *_NSGetArgv() == argv
@@ -6640,6 +6684,12 @@ void InitGoogleTestImpl(int* argc, CharType** argv) {
#if GTEST_HAS_ABSL
absl::InitializeSymbolizer(g_argvs[0].c_str());
+
+ // When using the Abseil Flags library, set the program usage message to the
+ // help message, but remove the color-encoding from the message first.
+ absl::SetProgramUsageMessage(absl::StrReplaceAll(
+ kColorEncodedHelpMessage,
+ {{"@D", ""}, {"@R", ""}, {"@G", ""}, {"@Y", ""}, {"@@", "@"}}));
#endif // GTEST_HAS_ABSL
ParseGoogleTestFlagsOnly(argc, argv);
@@ -6660,7 +6710,7 @@ void InitGoogleTestImpl(int* argc, CharType** argv) {
void InitGoogleTest(int* argc, char** argv) {
#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
-#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
internal::InitGoogleTestImpl(argc, argv);
#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
}
@@ -6670,7 +6720,7 @@ void InitGoogleTest(int* argc, char** argv) {
void InitGoogleTest(int* argc, wchar_t** argv) {
#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
-#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
internal::InitGoogleTestImpl(argc, argv);
#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
}
@@ -6686,42 +6736,42 @@ void InitGoogleTest() {
#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv);
-#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
internal::InitGoogleTestImpl(&argc, argv);
#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
}
+#if !defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
+// Return value of first environment variable that is set and contains
+// a non-empty string. If there are none, return the "fallback" string.
+// Since we like the temporary directory to have a directory separator suffix,
+// add it if not provided in the environment variable value.
+static std::string GetTempDirFromEnv(
+ std::initializer_list<const char*> environment_variables,
+ const char* fallback, char separator) {
+ for (const char* variable_name : environment_variables) {
+ const char* value = internal::posix::GetEnv(variable_name);
+ if (value != nullptr && value[0] != '\0') {
+ if (value[strlen(value) - 1] != separator) {
+ return std::string(value).append(1, separator);
+ }
+ return value;
+ }
+ }
+ return fallback;
+}
+#endif
+
std::string TempDir() {
#if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
return GTEST_CUSTOM_TEMPDIR_FUNCTION_();
-#elif GTEST_OS_WINDOWS_MOBILE
- return "\\temp\\";
-#elif GTEST_OS_WINDOWS
- const char* temp_dir = internal::posix::GetEnv("TEMP");
- if (temp_dir == nullptr || temp_dir[0] == '\0') {
- return "\\temp\\";
- } else if (temp_dir[strlen(temp_dir) - 1] == '\\') {
- return temp_dir;
- } else {
- return std::string(temp_dir) + "\\";
- }
+#elif GTEST_OS_WINDOWS || GTEST_OS_WINDOWS_MOBILE
+ return GetTempDirFromEnv({"TEST_TMPDIR", "TEMP"}, "\\temp\\", '\\');
#elif GTEST_OS_LINUX_ANDROID
- const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR");
- if (temp_dir == nullptr || temp_dir[0] == '\0') {
- return "/data/local/tmp/";
- } else {
- return temp_dir;
- }
-#elif GTEST_OS_LINUX
- const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR");
- if (temp_dir == nullptr || temp_dir[0] == '\0') {
- return "/tmp/";
- } else {
- return temp_dir;
- }
+ return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/data/local/tmp/", '/');
#else
- return "/tmp/";
-#endif // GTEST_OS_WINDOWS_MOBILE
+ return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/tmp/", '/');
+#endif
}
// Class ScopedTrace
@@ -6738,8 +6788,7 @@ void ScopedTrace::PushTrace(const char* file, int line, std::string message) {
}
// Pops the info pushed by the c'tor.
-ScopedTrace::~ScopedTrace()
- GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+ScopedTrace::~ScopedTrace() GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
UnitTest::GetInstance()->PopGTestTrace();
}
diff --git a/third_party/googletest/src/src/gtest_main.cc b/third_party/googletest/src/src/gtest_main.cc
index 46b27c3d7..44976375c 100644
--- a/third_party/googletest/src/src/gtest_main.cc
+++ b/third_party/googletest/src/src/gtest_main.cc
@@ -28,15 +28,14 @@
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <cstdio>
+
#include "gtest/gtest.h"
#if GTEST_OS_ESP8266 || GTEST_OS_ESP32
#if GTEST_OS_ESP8266
extern "C" {
#endif
-void setup() {
- testing::InitGoogleTest();
-}
+void setup() { testing::InitGoogleTest(); }
void loop() { RUN_ALL_TESTS(); }
diff --git a/tools/3D-Reconstruction/MotionEST/Exhaust.py b/tools/3D-Reconstruction/MotionEST/Exhaust.py
index 2d6a4d811..d763de856 100644
--- a/tools/3D-Reconstruction/MotionEST/Exhaust.py
+++ b/tools/3D-Reconstruction/MotionEST/Exhaust.py
@@ -83,7 +83,7 @@ class ExhaustNeighbor(MotionEST):
self.beta = beta
self.metric = metric
super(ExhaustNeighbor, self).__init__(cur_f, ref_f, blk_size)
- self.assign = np.zeros((self.num_row, self.num_col), dtype=np.bool)
+ self.assign = np.zeros((self.num_row, self.num_col), dtype=bool)
"""
estimate neighbor loss:
diff --git a/tools/3D-Reconstruction/MotionEST/GroundTruth.py b/tools/3D-Reconstruction/MotionEST/GroundTruth.py
index 12bc53ff7..37305898a 100644
--- a/tools/3D-Reconstruction/MotionEST/GroundTruth.py
+++ b/tools/3D-Reconstruction/MotionEST/GroundTruth.py
@@ -29,7 +29,7 @@ class GroundTruth(MotionEST):
def __init__(self, cur_f, ref_f, blk_sz, gt_path, mf=None, mask=None):
self.name = 'ground truth'
super(GroundTruth, self).__init__(cur_f, ref_f, blk_sz)
- self.mask = np.zeros((self.num_row, self.num_col), dtype=np.bool)
+ self.mask = np.zeros((self.num_row, self.num_col), dtype=bool)
if gt_path:
with open(gt_path) as gt_file:
lines = gt_file.readlines()
@@ -42,7 +42,7 @@ class GroundTruth(MotionEST):
self.mask[i, -j - 1] = True
continue
#the order of original file is flipped on the x axis
- self.mf[i, -j - 1] = np.array([float(y), -float(x)], dtype=np.int)
+ self.mf[i, -j - 1] = np.array([float(y), -float(x)], dtype=int)
else:
self.mf = mf
self.mask = mask
diff --git a/tools/3D-Reconstruction/MotionEST/MotionEST.py b/tools/3D-Reconstruction/MotionEST/MotionEST.py
index 0959530fa..fc393818d 100644
--- a/tools/3D-Reconstruction/MotionEST/MotionEST.py
+++ b/tools/3D-Reconstruction/MotionEST/MotionEST.py
@@ -28,8 +28,8 @@ class MotionEST(object):
self.ref_f = ref_f
self.blk_sz = blk_sz
#convert RGB to YUV
- self.cur_yuv = np.array(self.cur_f.convert('YCbCr'), dtype=np.int)
- self.ref_yuv = np.array(self.ref_f.convert('YCbCr'), dtype=np.int)
+ self.cur_yuv = np.array(self.cur_f.convert('YCbCr'), dtype=int)
+ self.ref_yuv = np.array(self.ref_f.convert('YCbCr'), dtype=int)
#frame size
self.width = self.cur_f.size[0]
self.height = self.cur_f.size[1]
diff --git a/tools/3D-Reconstruction/MotionEST/Util.py b/tools/3D-Reconstruction/MotionEST/Util.py
index 551881cfd..c2416163b 100644
--- a/tools/3D-Reconstruction/MotionEST/Util.py
+++ b/tools/3D-Reconstruction/MotionEST/Util.py
@@ -18,7 +18,7 @@ from PIL import Image, ImageDraw
def MSE(blk1, blk2):
return np.mean(
LA.norm(
- np.array(blk1, dtype=np.int) - np.array(blk2, dtype=np.int), axis=2))
+ np.array(blk1, dtype=int) - np.array(blk2, dtype=int), axis=2))
def drawMF(img, blk_sz, mf):
diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c
index 6889fdedd..3b3192362 100644
--- a/vp8/common/findnearmv.c
+++ b/vp8/common/findnearmv.c
@@ -105,9 +105,9 @@ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest,
tmp = near_mv_ref_cnts[CNT_NEAREST];
near_mv_ref_cnts[CNT_NEAREST] = near_mv_ref_cnts[CNT_NEAR];
near_mv_ref_cnts[CNT_NEAR] = tmp;
- tmp = near_mvs[CNT_NEAREST].as_int;
+ tmp = (int)near_mvs[CNT_NEAREST].as_int;
near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
- near_mvs[CNT_NEAR].as_int = tmp;
+ near_mvs[CNT_NEAR].as_int = (uint32_t)tmp;
}
/* Use near_mvs[0] to store the "best" MV */
diff --git a/vp8/common/mips/dspr2/filter_dspr2.c b/vp8/common/mips/dspr2/filter_dspr2.c
index e46827b0e..b9da52084 100644
--- a/vp8/common/mips/dspr2/filter_dspr2.c
+++ b/vp8/common/mips/dspr2/filter_dspr2.c
@@ -816,8 +816,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
: [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
[Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
- : [src_pixels_per_line] "r"(src_pixels_per_line),
- [output_ptr] "r"(output_ptr));
+ : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"(
+ output_ptr));
__asm__ __volatile__(
"ulw %[Temp1], 0(%[src_ptr]) \n\t"
@@ -832,8 +832,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
: [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
[Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
- : [src_pixels_per_line] "r"(src_pixels_per_line),
- [output_ptr] "r"(output_ptr));
+ : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"(
+ output_ptr));
__asm__ __volatile__(
"ulw %[Temp1], 0(%[src_ptr]) \n\t"
@@ -848,8 +848,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
: [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
[Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
- : [src_pixels_per_line] "r"(src_pixels_per_line),
- [output_ptr] "r"(output_ptr));
+ : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"(
+ output_ptr));
output_ptr += 48;
}
diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h
index ddc881a7f..7cb3c9869 100644
--- a/vp8/common/mips/msa/vp8_macros_msa.h
+++ b/vp8/common/mips/msa/vp8_macros_msa.h
@@ -69,12 +69,12 @@
#else // !(__mips == 64)
#define LD(psrc) \
({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ const uint8_t *psrc_ld = (const uint8_t *)(psrc); \
uint32_t val0_m, val1_m; \
uint64_t val_m = 0; \
\
- val0_m = LW(psrc_m); \
- val1_m = LW(psrc_m + 4); \
+ val0_m = LW(psrc_ld); \
+ val1_m = LW(psrc_ld + 4); \
\
val_m = (uint64_t)(val1_m); \
val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
@@ -122,10 +122,11 @@
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
uint32_t val_m; \
\
- asm volatile("lwr %[val_m], 0(%[psrc_m]) \n\t" \
- "lwl %[val_m], 3(%[psrc_m]) \n\t" \
- : [val_m] "=&r"(val_m) \
- : [psrc_m] "r"(psrc_m)); \
+ asm volatile( \
+ "lwr %[val_m], 0(%[psrc_m]) \n\t" \
+ "lwl %[val_m], 3(%[psrc_m]) \n\t" \
+ : [val_m] "=&r"(val_m) \
+ : [psrc_m] "r"(psrc_m)); \
\
val_m; \
})
@@ -136,10 +137,11 @@
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
uint64_t val_m = 0; \
\
- asm volatile("ldr %[val_m], 0(%[psrc_m]) \n\t" \
- "ldl %[val_m], 7(%[psrc_m]) \n\t" \
- : [val_m] "=&r"(val_m) \
- : [psrc_m] "r"(psrc_m)); \
+ asm volatile( \
+ "ldr %[val_m], 0(%[psrc_m]) \n\t" \
+ "ldl %[val_m], 7(%[psrc_m]) \n\t" \
+ : [val_m] "=&r"(val_m) \
+ : [psrc_m] "r"(psrc_m)); \
\
val_m; \
})
diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h
index f2a18f0d9..673b2fbd5 100644
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -15,6 +15,7 @@
#include <limits.h>
#include "./vpx_config.h"
+#include "vpx_ports/compiler_attributes.h"
#include "vpx_ports/mem.h"
#include "vpx/vp8dx.h"
#include "vpx/vpx_integer.h"
@@ -50,7 +51,8 @@ int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source,
void vp8dx_bool_decoder_fill(BOOL_DECODER *br);
-static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
+static VPX_NO_UNSIGNED_SHIFT_CHECK int vp8dx_decode_bool(BOOL_DECODER *br,
+ int probability) {
unsigned int bit = 0;
VP8_BD_VALUE value;
unsigned int split;
diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index 51817a2cb..3f459d623 100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -372,9 +372,9 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi,
tmp = cnt[CNT_NEAREST];
cnt[CNT_NEAREST] = cnt[CNT_NEAR];
cnt[CNT_NEAR] = tmp;
- tmp = near_mvs[CNT_NEAREST].as_int;
+ tmp = (int)near_mvs[CNT_NEAREST].as_int;
near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
- near_mvs[CNT_NEAR].as_int = tmp;
+ near_mvs[CNT_NEAR].as_int = (uint32_t)tmp;
}
if (vp8_read(bc, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index cf2c066d9..a6bedc4fa 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -11,6 +11,8 @@
#ifndef VPX_VP8_DECODER_ONYXD_INT_H_
#define VPX_VP8_DECODER_ONYXD_INT_H_
+#include <assert.h>
+
#include "vpx_config.h"
#include "vp8/common/onyxd.h"
#include "treereader.h"
@@ -136,6 +138,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb);
#if CONFIG_DEBUG
#define CHECK_MEM_ERROR(lval, expr) \
do { \
+ assert(pbi->common.error.setjmp); \
(lval) = (expr); \
if (!(lval)) \
vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \
@@ -145,6 +148,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb);
#else
#define CHECK_MEM_ERROR(lval, expr) \
do { \
+ assert(pbi->common.error.setjmp); \
(lval) = (expr); \
if (!(lval)) \
vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 0e97af5f2..190b013af 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -19,6 +19,7 @@
#include <limits.h>
#include "vpx/vpx_encoder.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/compiler_attributes.h"
#include "vpx_ports/system_state.h"
#include "bitstream.h"
@@ -117,7 +118,9 @@ static void write_split(vp8_writer *bc, int x) {
vp8_mbsplit_encodings + x);
}
-void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) {
+void VPX_NO_UNSIGNED_SHIFT_CHECK vp8_pack_tokens(vp8_writer *w,
+ const TOKENEXTRA *p,
+ int xcount) {
const TOKENEXTRA *stop = p + xcount;
unsigned int split;
int shift;
diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c
index c88ea1653..384bb2938 100644
--- a/vp8/encoder/encodemv.c
+++ b/vp8/encoder/encodemv.c
@@ -31,17 +31,15 @@ static void encode_mvcomponent(vp8_writer *const w, const int v,
vp8_write(w, 1, p[mvpis_short]);
- do
+ do {
vp8_write(w, (x >> i) & 1, p[MVPbits + i]);
-
- while (++i < 3);
+ } while (++i < 3);
i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */
- do
+ do {
vp8_write(w, (x >> i) & 1, p[MVPbits + i]);
-
- while (--i > 3);
+ } while (--i > 3);
if (x & 0xFFF0) vp8_write(w, (x >> 3) & 1, p[MVPbits + 3]);
}
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index ed177e3cb..65d2681c9 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -903,9 +903,9 @@ static double calc_correction_factor(double err_per_mb, double err_devisor,
correction_factor = pow(error_term, power_term);
/* Clip range */
- correction_factor = (correction_factor < 0.05)
- ? 0.05
- : (correction_factor > 5.0) ? 5.0 : correction_factor;
+ correction_factor = (correction_factor < 0.05) ? 0.05
+ : (correction_factor > 5.0) ? 5.0
+ : correction_factor;
return correction_factor;
}
@@ -947,11 +947,10 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats,
}
cpi->twopass.est_max_qcorrection_factor =
- (cpi->twopass.est_max_qcorrection_factor < 0.1)
- ? 0.1
- : (cpi->twopass.est_max_qcorrection_factor > 10.0)
- ? 10.0
- : cpi->twopass.est_max_qcorrection_factor;
+ (cpi->twopass.est_max_qcorrection_factor < 0.1) ? 0.1
+ : (cpi->twopass.est_max_qcorrection_factor > 10.0)
+ ? 10.0
+ : cpi->twopass.est_max_qcorrection_factor;
}
/* Corrections for higher compression speed settings
@@ -1178,10 +1177,9 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err,
} else {
current_spend_ratio = (double)cpi->long_rolling_actual_bits /
(double)cpi->long_rolling_target_bits;
- current_spend_ratio =
- (current_spend_ratio > 10.0)
- ? 10.0
- : (current_spend_ratio < 0.1) ? 0.1 : current_spend_ratio;
+ current_spend_ratio = (current_spend_ratio > 10.0) ? 10.0
+ : (current_spend_ratio < 0.1) ? 0.1
+ : current_spend_ratio;
}
/* Calculate a correction factor based on the quality of prediction in
@@ -1968,11 +1966,10 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
}
cpi->twopass.gf_group_bits =
- (cpi->twopass.gf_group_bits < 0)
- ? 0
- : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
- ? cpi->twopass.kf_group_bits
- : cpi->twopass.gf_group_bits;
+ (cpi->twopass.gf_group_bits < 0) ? 0
+ : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
+ ? cpi->twopass.kf_group_bits
+ : cpi->twopass.gf_group_bits;
/* Clip cpi->twopass.gf_group_bits based on user supplied data rate
* variability limit (cpi->oxcf.two_pass_vbrmax_section)
diff --git a/vp8/encoder/loongarch/quantize_lsx.c b/vp8/encoder/loongarch/vp8_quantize_lsx.c
index 75889192a..75889192a 100644
--- a/vp8/encoder/loongarch/quantize_lsx.c
+++ b/vp8/encoder/loongarch/vp8_quantize_lsx.c
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index ae092c66e..b92e2135e 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -204,20 +204,21 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) {
/* returns distortion + motion vector cost */
#define ERR(r, c) (MVC(r, c) + DIST(r, c))
/* checks if (r,c) has better score than previous best */
-#define CHECK_BETTER(v, r, c) \
- do { \
- IFMVCV(r, c, \
- { \
- thismse = DIST(r, c); \
- if ((v = (MVC(r, c) + thismse)) < besterr) { \
- besterr = v; \
- br = r; \
- bc = c; \
- *distortion = thismse; \
- *sse1 = sse; \
- } \
- }, \
- v = UINT_MAX;) \
+#define CHECK_BETTER(v, r, c) \
+ do { \
+ IFMVCV( \
+ r, c, \
+ { \
+ thismse = DIST(r, c); \
+ if ((v = (MVC(r, c) + thismse)) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ }, \
+ v = UINT_MAX;) \
} while (0)
int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index ffb3867dd..4bbeadef0 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -328,8 +328,8 @@ void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
// for any "new" layers. For "existing" layers, let them inherit the parameters
// from the previous layer state (at the same layer #). In future we may want
// to better map the previous layer state(s) to the "new" ones.
-static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
- const int prev_num_layers) {
+void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
+ const int prev_num_layers) {
int i;
double prev_layer_framerate = 0;
const int curr_num_layers = cpi->oxcf.number_of_layers;
@@ -1643,7 +1643,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
cpi->temporal_layer_id = 0;
}
cpi->temporal_pattern_counter = 0;
- reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
+ vp8_reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
}
if (!cpi->initial_width) {
@@ -4202,11 +4202,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
}
/* Clamp cpi->zbin_over_quant */
- cpi->mb.zbin_over_quant = (cpi->mb.zbin_over_quant < zbin_oq_low)
- ? zbin_oq_low
- : (cpi->mb.zbin_over_quant > zbin_oq_high)
- ? zbin_oq_high
- : cpi->mb.zbin_over_quant;
+ cpi->mb.zbin_over_quant =
+ (cpi->mb.zbin_over_quant < zbin_oq_low) ? zbin_oq_low
+ : (cpi->mb.zbin_over_quant > zbin_oq_high) ? zbin_oq_high
+ : cpi->mb.zbin_over_quant;
Loop = Q != last_q;
} else {
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 424f51b18..46a17913a 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -11,7 +11,9 @@
#ifndef VPX_VP8_ENCODER_ONYX_INT_H_
#define VPX_VP8_ENCODER_ONYX_INT_H_
+#include <assert.h>
#include <stdio.h>
+
#include "vpx_config.h"
#include "vp8/common/onyx.h"
#include "treewriter.h"
@@ -483,7 +485,7 @@ typedef struct VP8_COMP {
unsigned char *segmentation_map;
signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
- int segment_encode_breakout[MAX_MB_SEGMENTS];
+ unsigned int segment_encode_breakout[MAX_MB_SEGMENTS];
unsigned char *active_map;
unsigned int active_map_enabled;
@@ -711,6 +713,8 @@ void vp8_initialize_enc(void);
void vp8_alloc_compressor_data(VP8_COMP *cpi);
int vp8_reverse_trans(int x);
+void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
+ const int prev_num_layers);
void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
const int layer,
double prev_layer_framerate);
@@ -730,6 +734,7 @@ void vp8_set_speed_features(VP8_COMP *cpi);
#if CONFIG_DEBUG
#define CHECK_MEM_ERROR(lval, expr) \
do { \
+ assert(cpi->common.error.setjmp); \
(lval) = (expr); \
if (!(lval)) \
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \
@@ -739,6 +744,7 @@ void vp8_set_speed_features(VP8_COMP *cpi);
#else
#define CHECK_MEM_ERROR(lval, expr) \
do { \
+ assert(cpi->common.error.setjmp); \
(lval) = (expr); \
if (!(lval)) \
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 5821fc734..bbddacf8f 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1608,7 +1608,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4], RATE_DISTORTION *rd,
unsigned int q2dc = xd->block[24].dequant[0];
/* If theres is no codeable 2nd order dc
or a very small uniform pixel change change */
- if ((sse - var<q2dc * q2dc>> 4) || (sse / 2 > var && sse - var < 64)) {
+ if ((sse - var < q2dc * q2dc >> 4) || (sse / 2 > var && sse - var < 64)) {
/* Check u and v to make sure skip is ok */
unsigned int sse2 = VP8_UVSSE(x);
if (sse2 * 2 < threshold) {
diff --git a/vp8/encoder/x86/denoising_sse2.c b/vp8/encoder/x86/denoising_sse2.c
index 89cad5335..f35b93016 100644
--- a/vp8/encoder/x86/denoising_sse2.c
+++ b/vp8/encoder/x86/denoising_sse2.c
@@ -30,7 +30,7 @@ static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) {
_mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
const __m128i hgfedcba =
_mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
- unsigned int sum_diff = abs(_mm_cvtsi128_si32(hgfedcba));
+ unsigned int sum_diff = (unsigned int)abs(_mm_cvtsi128_si32(hgfedcba));
return sum_diff;
}
diff --git a/vp8/encoder/x86/quantize_sse4.c b/vp8/encoder/x86/quantize_sse4.c
index 6d03365fc..4c2d24cc2 100644
--- a/vp8/encoder/x86/quantize_sse4.c
+++ b/vp8/encoder/x86/quantize_sse4.c
@@ -13,8 +13,11 @@
#include "./vp8_rtcd.h"
#include "vp8/encoder/block.h"
#include "vpx_ports/bitops.h" /* get_lsb */
+#include "vpx_ports/compiler_attributes.h"
-void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
+// Unsigned shift overflow is disabled for the use of ~1U << eob with ymask.
+VPX_NO_UNSIGNED_SHIFT_CHECK void vp8_regular_quantize_b_sse4_1(BLOCK *b,
+ BLOCKD *d) {
int eob = -1;
short *zbin_boost_ptr = b->zrun_zbin_boost;
__m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr));
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 6d88e5154..55a77ba7e 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -275,7 +275,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
void *user_priv, long deadline) {
volatile vpx_codec_err_t res;
volatile unsigned int resolution_change = 0;
- unsigned int w, h;
+ volatile unsigned int w, h;
if (!ctx->fragments.enabled && (data == NULL && data_sz == 0)) {
return 0;
diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc
index 2f23c5b1d..f3f42529d 100644
--- a/vp8/vp8_ratectrl_rtc.cc
+++ b/vp8/vp8_ratectrl_rtc.cc
@@ -92,6 +92,7 @@ void VP8RateControlRTC::UpdateRateControl(
const VP8RateControlRtcConfig &rc_cfg) {
VP8_COMMON *cm = &cpi_->common;
VP8_CONFIG *oxcf = &cpi_->oxcf;
+ const unsigned int prev_number_of_layers = oxcf->number_of_layers;
vpx_clear_system_state();
cm->Width = rc_cfg.width;
cm->Height = rc_cfg.height;
@@ -124,17 +125,33 @@ void VP8RateControlRTC::UpdateRateControl(
static_cast<int>(cpi_->output_framerate);
}
- if (oxcf->number_of_layers > 1) {
+ if (oxcf->number_of_layers > 1 || prev_number_of_layers > 1) {
memcpy(oxcf->target_bitrate, rc_cfg.layer_target_bitrate,
sizeof(rc_cfg.layer_target_bitrate));
memcpy(oxcf->rate_decimator, rc_cfg.ts_rate_decimator,
sizeof(rc_cfg.ts_rate_decimator));
- oxcf->periodicity = 2;
+ if (cm->current_video_frame == 0) {
+ double prev_layer_framerate = 0;
+ for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) {
+ vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate);
+ prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i];
+ }
+ } else if (oxcf->number_of_layers != prev_number_of_layers) {
+ // The number of temporal layers has changed, so reset/initialize the
+ // temporal layer context for the new layer configuration: this means
+ // calling vp8_reset_temporal_layer_change() below.
+
+ // Start at the base of the pattern cycle, so set the layer id to 0 and
+ // reset the temporal pattern counter.
+ // TODO(marpan/jianj): don't think lines 148-151 are needed (user controls
+ // the layer_id) so remove.
+ if (cpi_->temporal_layer_id > 0) {
+ cpi_->temporal_layer_id = 0;
+ }
+ cpi_->temporal_pattern_counter = 0;
- double prev_layer_framerate = 0;
- for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) {
- vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate);
- prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i];
+ vp8_reset_temporal_layer_change(cpi_, oxcf,
+ static_cast<int>(prev_number_of_layers));
}
}
@@ -146,20 +163,24 @@ void VP8RateControlRTC::UpdateRateControl(
cm->MBs = cm->mb_rows * cm->mb_cols;
cm->mode_info_stride = cm->mb_cols + 1;
- oxcf->starting_buffer_level =
- rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000);
- /* Set or reset optimal and maximum buffer levels. */
- if (oxcf->optimal_buffer_level == 0) {
- oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8;
- } else {
- oxcf->optimal_buffer_level =
- rescale((int)oxcf->optimal_buffer_level, oxcf->target_bandwidth, 1000);
- }
- if (oxcf->maximum_buffer_size == 0) {
- oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8;
- } else {
- oxcf->maximum_buffer_size =
- rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000);
+ // For temporal layers: starting/maximum/optimal_buffer_level is already set
+ // via vp8_init_temporal_layer_context() or vp8_reset_temporal_layer_change().
+ if (oxcf->number_of_layers <= 1 && prev_number_of_layers <= 1) {
+ oxcf->starting_buffer_level =
+ rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000);
+ /* Set or reset optimal and maximum buffer levels. */
+ if (oxcf->optimal_buffer_level == 0) {
+ oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8;
+ } else {
+ oxcf->optimal_buffer_level = rescale((int)oxcf->optimal_buffer_level,
+ oxcf->target_bandwidth, 1000);
+ }
+ if (oxcf->maximum_buffer_size == 0) {
+ oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8;
+ } else {
+ oxcf->maximum_buffer_size =
+ rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000);
+ }
}
if (cpi_->bits_off_target > oxcf->maximum_buffer_size) {
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 5744cbabc..b4b3fda9e 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -125,8 +125,8 @@ VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
endif
# common (loongarch LSX intrinsics)
-VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/quantize_lsx.c
VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/dct_lsx.c
VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/encodeopt_lsx.c
+VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/vp8_quantize_lsx.c
VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 3cec53bfd..8d2bed38e 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -49,6 +49,7 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {
#if CONFIG_DEBUG
#define CHECK_MEM_ERROR(cm, lval, expr) \
do { \
+ assert(&(cm)->error.setjmp); \
(lval) = (expr); \
if (!(lval)) \
vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
@@ -58,6 +59,7 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {
#else
#define CHECK_MEM_ERROR(cm, lval, expr) \
do { \
+ assert(&(cm)->error.setjmp); \
(lval) = (expr); \
if (!(lval)) \
vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 95d6029f3..765cb1172 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -1180,7 +1180,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
}
// Disable filtering on the leftmost column
- border_mask = ~(mi_col == 0 ? 1 : 0);
+ border_mask = ~(mi_col == 0 ? 1u : 0u);
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
highbd_filter_selectively_vert(
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 4da0b6675..f4bd9772c 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -129,10 +129,10 @@ add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_
add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp neon sse2 avx2 vsx/, "$ssse3_x86_64";
+specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/;
add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp_32x32 neon vsx/, "$ssse3_x86_64";
+specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_block_error avx2 sse2/;
@@ -175,7 +175,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
# Motion search
#
add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_diamond_search_sad avx/;
+specialize qw/vp9_diamond_search_sad avx neon/;
#
# Apply temporal filter
@@ -196,11 +196,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# ENCODEMB INVOKE
add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_highbd_quantize_fp avx2 neon/;
add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ;
+ specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/;
# fdct functions
add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_highbd_fht4x4 neon/;
add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
diff --git a/vp9/common/vp9_scan.c b/vp9/common/vp9_scan.c
index 0fef26351..8bea61dea 100644
--- a/vp9/common/vp9_scan.c
+++ b/vp9/common/vp9_scan.c
@@ -511,180 +511,181 @@ DECLARE_ALIGNED(16, static const int16_t,
959, 990, 991, 1022, 0, 0,
};
+// Add 1 to iscan values. This represents the EOB position instead of the index.
DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_4x4[16]) = {
- 0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15,
+ 1, 3, 6, 9, 2, 4, 10, 13, 5, 8, 12, 15, 7, 11, 14, 16,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_4x4[16]) = {
- 0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15,
+ 1, 4, 8, 12, 2, 6, 10, 13, 3, 7, 11, 15, 5, 9, 14, 16,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_4x4[16]) = {
- 0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
+ 1, 2, 4, 6, 3, 5, 7, 10, 8, 9, 12, 14, 11, 13, 15, 16,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_8x8[64]) = {
- 0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51,
- 2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56,
- 6, 12, 21, 27, 35, 43, 52, 58, 9, 17, 25, 33, 39, 48, 55, 60,
- 14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63,
+ 1, 4, 9, 16, 23, 33, 41, 48, 2, 6, 12, 19, 27, 35, 45, 52,
+ 3, 8, 14, 21, 29, 39, 47, 55, 5, 11, 17, 25, 32, 42, 51, 57,
+ 7, 13, 22, 28, 36, 44, 53, 59, 10, 18, 26, 34, 40, 49, 56, 61,
+ 15, 24, 31, 38, 46, 54, 60, 63, 20, 30, 37, 43, 50, 58, 62, 64,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_8x8[64]) = {
- 0, 1, 2, 5, 8, 12, 19, 24, 3, 4, 7, 10, 15, 20, 30, 39,
- 6, 9, 13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52,
- 18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59,
- 32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63,
+ 1, 2, 3, 6, 9, 13, 20, 25, 4, 5, 8, 11, 16, 21, 31, 40,
+ 7, 10, 14, 17, 22, 28, 38, 47, 12, 15, 18, 24, 29, 35, 45, 53,
+ 19, 23, 26, 32, 36, 42, 51, 58, 27, 30, 34, 39, 44, 50, 56, 60,
+ 33, 37, 43, 48, 52, 55, 61, 62, 41, 46, 49, 54, 57, 59, 63, 64,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_8x8[64]) = {
- 0, 2, 5, 9, 14, 22, 31, 37, 1, 4, 8, 13, 19, 26, 38, 44,
- 3, 6, 10, 17, 24, 30, 42, 49, 7, 11, 15, 21, 29, 36, 47, 53,
- 12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60,
- 25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
+ 1, 3, 6, 10, 15, 23, 32, 38, 2, 5, 9, 14, 20, 27, 39, 45,
+ 4, 7, 11, 18, 25, 31, 43, 50, 8, 12, 16, 22, 30, 37, 48, 54,
+ 13, 17, 21, 28, 35, 44, 53, 58, 19, 24, 29, 36, 42, 49, 57, 61,
+ 26, 33, 40, 46, 51, 56, 60, 63, 34, 41, 47, 52, 55, 59, 62, 64,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_16x16[256]) = {
- 0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198,
- 1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212,
- 2, 8, 16, 25, 38, 52, 67, 83, 101, 116, 136, 157, 172, 190, 205, 216,
- 3, 10, 18, 29, 41, 55, 71, 89, 103, 119, 141, 159, 176, 194, 208, 218,
- 5, 12, 21, 32, 45, 58, 74, 93, 104, 123, 144, 164, 179, 196, 210, 223,
- 7, 15, 26, 37, 49, 63, 78, 96, 112, 129, 146, 166, 182, 200, 215, 228,
- 9, 19, 28, 39, 54, 69, 86, 102, 117, 132, 151, 170, 187, 206, 220, 230,
- 13, 24, 35, 46, 60, 73, 91, 108, 122, 137, 154, 174, 189, 207, 224, 235,
- 17, 30, 40, 53, 66, 82, 98, 115, 126, 142, 161, 180, 197, 213, 227, 237,
- 22, 36, 48, 62, 76, 92, 105, 120, 133, 147, 167, 186, 203, 219, 232, 240,
- 27, 44, 56, 70, 84, 99, 113, 127, 140, 156, 175, 193, 209, 226, 236, 244,
- 33, 51, 68, 79, 94, 110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247,
- 42, 61, 77, 90, 106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251,
- 50, 72, 87, 100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253,
- 57, 80, 97, 111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254,
- 65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255,
+ 1, 5, 12, 21, 32, 44, 60, 76, 86, 110, 131, 151, 166, 182, 196, 199,
+ 2, 7, 15, 24, 35, 48, 65, 82, 96, 115, 136, 154, 172, 189, 202, 213,
+ 3, 9, 17, 26, 39, 53, 68, 84, 102, 117, 137, 158, 173, 191, 206, 217,
+ 4, 11, 19, 30, 42, 56, 72, 90, 104, 120, 142, 160, 177, 195, 209, 219,
+ 6, 13, 22, 33, 46, 59, 75, 94, 105, 124, 145, 165, 180, 197, 211, 224,
+ 8, 16, 27, 38, 50, 64, 79, 97, 113, 130, 147, 167, 183, 201, 216, 229,
+ 10, 20, 29, 40, 55, 70, 87, 103, 118, 133, 152, 171, 188, 207, 221, 231,
+ 14, 25, 36, 47, 61, 74, 92, 109, 123, 138, 155, 175, 190, 208, 225, 236,
+ 18, 31, 41, 54, 67, 83, 99, 116, 127, 143, 162, 181, 198, 214, 228, 238,
+ 23, 37, 49, 63, 77, 93, 106, 121, 134, 148, 168, 187, 204, 220, 233, 241,
+ 28, 45, 57, 71, 85, 100, 114, 128, 141, 157, 176, 194, 210, 227, 237, 245,
+ 34, 52, 69, 80, 95, 111, 126, 139, 150, 163, 185, 203, 218, 230, 242, 248,
+ 43, 62, 78, 91, 107, 122, 135, 149, 161, 174, 192, 212, 226, 239, 246, 252,
+ 51, 73, 88, 101, 119, 129, 146, 159, 169, 184, 205, 223, 234, 243, 250, 254,
+ 58, 81, 98, 112, 132, 144, 156, 170, 179, 193, 215, 232, 240, 247, 251, 255,
+ 66, 89, 108, 125, 140, 153, 164, 178, 186, 200, 222, 235, 244, 249, 253, 256,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_16x16[256]) = {
- 0, 1, 2, 4, 6, 9, 12, 17, 22, 29, 36, 43, 54, 64, 76,
- 86, 3, 5, 7, 11, 15, 19, 25, 32, 38, 48, 59, 68, 84, 99,
- 115, 130, 8, 10, 13, 18, 23, 27, 33, 42, 51, 60, 72, 88, 103,
- 119, 142, 167, 14, 16, 20, 26, 31, 37, 44, 53, 61, 73, 85, 100,
- 116, 135, 161, 185, 21, 24, 30, 35, 40, 47, 55, 65, 74, 81, 94,
- 112, 133, 154, 179, 205, 28, 34, 39, 45, 50, 58, 67, 77, 87, 96,
- 106, 121, 146, 169, 196, 212, 41, 46, 49, 56, 63, 70, 79, 90, 98,
- 107, 122, 138, 159, 182, 207, 222, 52, 57, 62, 69, 75, 83, 93, 102,
- 110, 120, 134, 150, 176, 195, 215, 226, 66, 71, 78, 82, 91, 97, 108,
- 113, 127, 136, 148, 168, 188, 202, 221, 232, 80, 89, 92, 101, 105, 114,
- 125, 131, 139, 151, 162, 177, 192, 208, 223, 234, 95, 104, 109, 117, 123,
- 128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239, 111, 118, 124, 129,
- 140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240, 243, 126, 132, 137,
- 145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237, 244, 246, 141, 149,
- 156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238, 242, 249, 251, 152,
- 163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236, 245, 247, 252, 253,
- 158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235, 241, 248, 250, 254,
- 255,
+ 1, 2, 3, 5, 7, 10, 13, 18, 23, 30, 37, 44, 55, 65, 77,
+ 87, 4, 6, 8, 12, 16, 20, 26, 33, 39, 49, 60, 69, 85, 100,
+ 116, 131, 9, 11, 14, 19, 24, 28, 34, 43, 52, 61, 73, 89, 104,
+ 120, 143, 168, 15, 17, 21, 27, 32, 38, 45, 54, 62, 74, 86, 101,
+ 117, 136, 162, 186, 22, 25, 31, 36, 41, 48, 56, 66, 75, 82, 95,
+ 113, 134, 155, 180, 206, 29, 35, 40, 46, 51, 59, 68, 78, 88, 97,
+ 107, 122, 147, 170, 197, 213, 42, 47, 50, 57, 64, 71, 80, 91, 99,
+ 108, 123, 139, 160, 183, 208, 223, 53, 58, 63, 70, 76, 84, 94, 103,
+ 111, 121, 135, 151, 177, 196, 216, 227, 67, 72, 79, 83, 92, 98, 109,
+ 114, 128, 137, 149, 169, 189, 203, 222, 233, 81, 90, 93, 102, 106, 115,
+ 126, 132, 140, 152, 163, 178, 193, 209, 224, 235, 96, 105, 110, 118, 124,
+ 129, 144, 145, 156, 166, 176, 191, 207, 220, 234, 240, 112, 119, 125, 130,
+ 141, 148, 158, 165, 171, 182, 192, 204, 225, 231, 241, 244, 127, 133, 138,
+ 146, 154, 161, 175, 179, 185, 198, 205, 217, 232, 238, 245, 247, 142, 150,
+ 157, 167, 173, 181, 190, 200, 201, 211, 221, 229, 239, 243, 250, 252, 153,
+ 164, 172, 184, 187, 194, 202, 212, 215, 219, 228, 237, 246, 248, 253, 254,
+ 159, 174, 188, 195, 199, 210, 214, 218, 226, 230, 236, 242, 249, 251, 255,
+ 256,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_16x16[256]) = {
- 0, 2, 5, 9, 17, 24, 36, 44, 55, 72, 88, 104, 128, 143, 166,
- 179, 1, 4, 8, 13, 20, 30, 40, 54, 66, 79, 96, 113, 141, 154,
- 178, 196, 3, 7, 11, 18, 25, 33, 46, 57, 71, 86, 101, 119, 148,
- 164, 186, 201, 6, 12, 16, 23, 31, 39, 53, 64, 78, 92, 110, 127,
- 153, 169, 193, 208, 10, 14, 19, 28, 37, 47, 58, 67, 84, 98, 114,
- 133, 161, 176, 198, 214, 15, 21, 26, 34, 43, 52, 65, 77, 91, 106,
- 120, 140, 165, 185, 205, 221, 22, 27, 32, 41, 48, 60, 73, 85, 99,
- 116, 130, 151, 175, 190, 211, 225, 29, 35, 42, 49, 59, 69, 81, 95,
- 108, 125, 139, 155, 182, 197, 217, 229, 38, 45, 51, 61, 68, 80, 93,
- 105, 118, 134, 150, 168, 191, 207, 223, 234, 50, 56, 63, 74, 83, 94,
- 109, 117, 129, 147, 163, 177, 199, 213, 228, 238, 62, 70, 76, 87, 97,
- 107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242, 75, 82, 90, 102,
- 112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245, 89, 100, 111,
- 123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250, 103, 115,
- 126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248, 252, 121,
- 135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244, 251, 254,
- 137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247, 249, 253,
- 255,
+ 1, 3, 6, 10, 18, 25, 37, 45, 56, 73, 89, 105, 129, 144, 167,
+ 180, 2, 5, 9, 14, 21, 31, 41, 55, 67, 80, 97, 114, 142, 155,
+ 179, 197, 4, 8, 12, 19, 26, 34, 47, 58, 72, 87, 102, 120, 149,
+ 165, 187, 202, 7, 13, 17, 24, 32, 40, 54, 65, 79, 93, 111, 128,
+ 154, 170, 194, 209, 11, 15, 20, 29, 38, 48, 59, 68, 85, 99, 115,
+ 134, 162, 177, 199, 215, 16, 22, 27, 35, 44, 53, 66, 78, 92, 107,
+ 121, 141, 166, 186, 206, 222, 23, 28, 33, 42, 49, 61, 74, 86, 100,
+ 117, 131, 152, 176, 191, 212, 226, 30, 36, 43, 50, 60, 70, 82, 96,
+ 109, 126, 140, 156, 183, 198, 218, 230, 39, 46, 52, 62, 69, 81, 94,
+ 106, 119, 135, 151, 169, 192, 208, 224, 235, 51, 57, 64, 75, 84, 95,
+ 110, 118, 130, 148, 164, 178, 200, 214, 229, 239, 63, 71, 77, 88, 98,
+ 108, 123, 132, 146, 160, 173, 189, 211, 223, 236, 243, 76, 83, 91, 103,
+ 113, 125, 139, 147, 158, 174, 188, 203, 220, 231, 241, 246, 90, 101, 112,
+ 124, 133, 143, 157, 168, 181, 190, 204, 217, 232, 238, 247, 251, 104, 116,
+ 127, 137, 150, 163, 172, 184, 195, 205, 216, 225, 237, 242, 249, 253, 122,
+ 136, 145, 159, 171, 182, 193, 201, 210, 219, 228, 234, 244, 245, 252, 255,
+ 138, 153, 161, 175, 185, 196, 207, 213, 221, 227, 233, 240, 248, 250, 254,
+ 256,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = {
- 0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145,
- 170, 193, 204, 210, 219, 229, 233, 245, 257, 275, 299, 342, 356,
- 377, 405, 455, 471, 495, 527, 1, 4, 8, 15, 22, 30, 45,
- 58, 74, 92, 112, 133, 158, 184, 203, 215, 222, 228, 234, 237,
- 256, 274, 298, 317, 355, 376, 404, 426, 470, 494, 526, 551, 3,
- 7, 12, 18, 28, 36, 52, 64, 82, 102, 118, 142, 164, 189,
- 208, 217, 224, 231, 235, 238, 273, 297, 316, 329, 375, 403, 425,
- 440, 493, 525, 550, 567, 6, 11, 16, 23, 31, 43, 60, 73,
- 90, 109, 126, 150, 173, 196, 211, 220, 226, 232, 236, 239, 296,
- 315, 328, 335, 402, 424, 439, 447, 524, 549, 566, 575, 9, 14,
- 19, 29, 37, 50, 65, 78, 95, 116, 134, 157, 179, 201, 214,
- 223, 244, 255, 272, 295, 341, 354, 374, 401, 454, 469, 492, 523,
- 582, 596, 617, 645, 13, 20, 26, 35, 44, 54, 72, 85, 105,
- 123, 140, 163, 182, 205, 216, 225, 254, 271, 294, 314, 353, 373,
- 400, 423, 468, 491, 522, 548, 595, 616, 644, 666, 21, 27, 33,
- 42, 53, 63, 80, 94, 113, 132, 151, 172, 190, 209, 218, 227,
- 270, 293, 313, 327, 372, 399, 422, 438, 490, 521, 547, 565, 615,
- 643, 665, 680, 24, 32, 39, 48, 57, 71, 88, 104, 120, 139,
- 159, 178, 197, 212, 221, 230, 292, 312, 326, 334, 398, 421, 437,
- 446, 520, 546, 564, 574, 642, 664, 679, 687, 34, 40, 46, 56,
- 68, 81, 96, 111, 130, 147, 167, 186, 243, 253, 269, 291, 340,
- 352, 371, 397, 453, 467, 489, 519, 581, 594, 614, 641, 693, 705,
- 723, 747, 41, 49, 55, 67, 77, 91, 107, 124, 138, 161, 177,
- 194, 252, 268, 290, 311, 351, 370, 396, 420, 466, 488, 518, 545,
- 593, 613, 640, 663, 704, 722, 746, 765, 51, 59, 66, 76, 89,
- 99, 119, 131, 149, 168, 181, 200, 267, 289, 310, 325, 369, 395,
- 419, 436, 487, 517, 544, 563, 612, 639, 662, 678, 721, 745, 764,
- 777, 61, 69, 75, 87, 100, 114, 129, 144, 162, 180, 191, 207,
- 288, 309, 324, 333, 394, 418, 435, 445, 516, 543, 562, 573, 638,
- 661, 677, 686, 744, 763, 776, 783, 70, 79, 86, 97, 108, 122,
- 137, 155, 242, 251, 266, 287, 339, 350, 368, 393, 452, 465, 486,
- 515, 580, 592, 611, 637, 692, 703, 720, 743, 788, 798, 813, 833,
- 84, 93, 103, 110, 125, 141, 154, 171, 250, 265, 286, 308, 349,
- 367, 392, 417, 464, 485, 514, 542, 591, 610, 636, 660, 702, 719,
- 742, 762, 797, 812, 832, 848, 98, 106, 115, 127, 143, 156, 169,
- 185, 264, 285, 307, 323, 366, 391, 416, 434, 484, 513, 541, 561,
- 609, 635, 659, 676, 718, 741, 761, 775, 811, 831, 847, 858, 117,
- 128, 136, 148, 160, 175, 188, 198, 284, 306, 322, 332, 390, 415,
- 433, 444, 512, 540, 560, 572, 634, 658, 675, 685, 740, 760, 774,
- 782, 830, 846, 857, 863, 135, 146, 152, 165, 241, 249, 263, 283,
- 338, 348, 365, 389, 451, 463, 483, 511, 579, 590, 608, 633, 691,
- 701, 717, 739, 787, 796, 810, 829, 867, 875, 887, 903, 153, 166,
- 174, 183, 248, 262, 282, 305, 347, 364, 388, 414, 462, 482, 510,
- 539, 589, 607, 632, 657, 700, 716, 738, 759, 795, 809, 828, 845,
- 874, 886, 902, 915, 176, 187, 195, 202, 261, 281, 304, 321, 363,
- 387, 413, 432, 481, 509, 538, 559, 606, 631, 656, 674, 715, 737,
- 758, 773, 808, 827, 844, 856, 885, 901, 914, 923, 192, 199, 206,
- 213, 280, 303, 320, 331, 386, 412, 431, 443, 508, 537, 558, 571,
- 630, 655, 673, 684, 736, 757, 772, 781, 826, 843, 855, 862, 900,
- 913, 922, 927, 240, 247, 260, 279, 337, 346, 362, 385, 450, 461,
- 480, 507, 578, 588, 605, 629, 690, 699, 714, 735, 786, 794, 807,
- 825, 866, 873, 884, 899, 930, 936, 945, 957, 246, 259, 278, 302,
- 345, 361, 384, 411, 460, 479, 506, 536, 587, 604, 628, 654, 698,
- 713, 734, 756, 793, 806, 824, 842, 872, 883, 898, 912, 935, 944,
- 956, 966, 258, 277, 301, 319, 360, 383, 410, 430, 478, 505, 535,
- 557, 603, 627, 653, 672, 712, 733, 755, 771, 805, 823, 841, 854,
- 882, 897, 911, 921, 943, 955, 965, 972, 276, 300, 318, 330, 382,
- 409, 429, 442, 504, 534, 556, 570, 626, 652, 671, 683, 732, 754,
- 770, 780, 822, 840, 853, 861, 896, 910, 920, 926, 954, 964, 971,
- 975, 336, 344, 359, 381, 449, 459, 477, 503, 577, 586, 602, 625,
- 689, 697, 711, 731, 785, 792, 804, 821, 865, 871, 881, 895, 929,
- 934, 942, 953, 977, 981, 987, 995, 343, 358, 380, 408, 458, 476,
- 502, 533, 585, 601, 624, 651, 696, 710, 730, 753, 791, 803, 820,
- 839, 870, 880, 894, 909, 933, 941, 952, 963, 980, 986, 994, 1001,
- 357, 379, 407, 428, 475, 501, 532, 555, 600, 623, 650, 670, 709,
- 729, 752, 769, 802, 819, 838, 852, 879, 893, 908, 919, 940, 951,
- 962, 970, 985, 993, 1000, 1005, 378, 406, 427, 441, 500, 531, 554,
- 569, 622, 649, 669, 682, 728, 751, 768, 779, 818, 837, 851, 860,
- 892, 907, 918, 925, 950, 961, 969, 974, 992, 999, 1004, 1007, 448,
- 457, 474, 499, 576, 584, 599, 621, 688, 695, 708, 727, 784, 790,
- 801, 817, 864, 869, 878, 891, 928, 932, 939, 949, 976, 979, 984,
- 991, 1008, 1010, 1013, 1017, 456, 473, 498, 530, 583, 598, 620, 648,
- 694, 707, 726, 750, 789, 800, 816, 836, 868, 877, 890, 906, 931,
- 938, 948, 960, 978, 983, 990, 998, 1009, 1012, 1016, 1020, 472, 497,
- 529, 553, 597, 619, 647, 668, 706, 725, 749, 767, 799, 815, 835,
- 850, 876, 889, 905, 917, 937, 947, 959, 968, 982, 989, 997, 1003,
- 1011, 1015, 1019, 1022, 496, 528, 552, 568, 618, 646, 667, 681, 724,
- 748, 766, 778, 814, 834, 849, 859, 888, 904, 916, 924, 946, 958,
- 967, 973, 988, 996, 1002, 1006, 1014, 1018, 1021, 1023,
+ 1, 3, 6, 11, 18, 26, 39, 48, 63, 84, 102, 122, 146,
+ 171, 194, 205, 211, 220, 230, 234, 246, 258, 276, 300, 343, 357,
+ 378, 406, 456, 472, 496, 528, 2, 5, 9, 16, 23, 31, 46,
+ 59, 75, 93, 113, 134, 159, 185, 204, 216, 223, 229, 235, 238,
+ 257, 275, 299, 318, 356, 377, 405, 427, 471, 495, 527, 552, 4,
+ 8, 13, 19, 29, 37, 53, 65, 83, 103, 119, 143, 165, 190,
+ 209, 218, 225, 232, 236, 239, 274, 298, 317, 330, 376, 404, 426,
+ 441, 494, 526, 551, 568, 7, 12, 17, 24, 32, 44, 61, 74,
+ 91, 110, 127, 151, 174, 197, 212, 221, 227, 233, 237, 240, 297,
+ 316, 329, 336, 403, 425, 440, 448, 525, 550, 567, 576, 10, 15,
+ 20, 30, 38, 51, 66, 79, 96, 117, 135, 158, 180, 202, 215,
+ 224, 245, 256, 273, 296, 342, 355, 375, 402, 455, 470, 493, 524,
+ 583, 597, 618, 646, 14, 21, 27, 36, 45, 55, 73, 86, 106,
+ 124, 141, 164, 183, 206, 217, 226, 255, 272, 295, 315, 354, 374,
+ 401, 424, 469, 492, 523, 549, 596, 617, 645, 667, 22, 28, 34,
+ 43, 54, 64, 81, 95, 114, 133, 152, 173, 191, 210, 219, 228,
+ 271, 294, 314, 328, 373, 400, 423, 439, 491, 522, 548, 566, 616,
+ 644, 666, 681, 25, 33, 40, 49, 58, 72, 89, 105, 121, 140,
+ 160, 179, 198, 213, 222, 231, 293, 313, 327, 335, 399, 422, 438,
+ 447, 521, 547, 565, 575, 643, 665, 680, 688, 35, 41, 47, 57,
+ 69, 82, 97, 112, 131, 148, 168, 187, 244, 254, 270, 292, 341,
+ 353, 372, 398, 454, 468, 490, 520, 582, 595, 615, 642, 694, 706,
+ 724, 748, 42, 50, 56, 68, 78, 92, 108, 125, 139, 162, 178,
+ 195, 253, 269, 291, 312, 352, 371, 397, 421, 467, 489, 519, 546,
+ 594, 614, 641, 664, 705, 723, 747, 766, 52, 60, 67, 77, 90,
+ 100, 120, 132, 150, 169, 182, 201, 268, 290, 311, 326, 370, 396,
+ 420, 437, 488, 518, 545, 564, 613, 640, 663, 679, 722, 746, 765,
+ 778, 62, 70, 76, 88, 101, 115, 130, 145, 163, 181, 192, 208,
+ 289, 310, 325, 334, 395, 419, 436, 446, 517, 544, 563, 574, 639,
+ 662, 678, 687, 745, 764, 777, 784, 71, 80, 87, 98, 109, 123,
+ 138, 156, 243, 252, 267, 288, 340, 351, 369, 394, 453, 466, 487,
+ 516, 581, 593, 612, 638, 693, 704, 721, 744, 789, 799, 814, 834,
+ 85, 94, 104, 111, 126, 142, 155, 172, 251, 266, 287, 309, 350,
+ 368, 393, 418, 465, 486, 515, 543, 592, 611, 637, 661, 703, 720,
+ 743, 763, 798, 813, 833, 849, 99, 107, 116, 128, 144, 157, 170,
+ 186, 265, 286, 308, 324, 367, 392, 417, 435, 485, 514, 542, 562,
+ 610, 636, 660, 677, 719, 742, 762, 776, 812, 832, 848, 859, 118,
+ 129, 137, 149, 161, 176, 189, 199, 285, 307, 323, 333, 391, 416,
+ 434, 445, 513, 541, 561, 573, 635, 659, 676, 686, 741, 761, 775,
+ 783, 831, 847, 858, 864, 136, 147, 153, 166, 242, 250, 264, 284,
+ 339, 349, 366, 390, 452, 464, 484, 512, 580, 591, 609, 634, 692,
+ 702, 718, 740, 788, 797, 811, 830, 868, 876, 888, 904, 154, 167,
+ 175, 184, 249, 263, 283, 306, 348, 365, 389, 415, 463, 483, 511,
+ 540, 590, 608, 633, 658, 701, 717, 739, 760, 796, 810, 829, 846,
+ 875, 887, 903, 916, 177, 188, 196, 203, 262, 282, 305, 322, 364,
+ 388, 414, 433, 482, 510, 539, 560, 607, 632, 657, 675, 716, 738,
+ 759, 774, 809, 828, 845, 857, 886, 902, 915, 924, 193, 200, 207,
+ 214, 281, 304, 321, 332, 387, 413, 432, 444, 509, 538, 559, 572,
+ 631, 656, 674, 685, 737, 758, 773, 782, 827, 844, 856, 863, 901,
+ 914, 923, 928, 241, 248, 261, 280, 338, 347, 363, 386, 451, 462,
+ 481, 508, 579, 589, 606, 630, 691, 700, 715, 736, 787, 795, 808,
+ 826, 867, 874, 885, 900, 931, 937, 946, 958, 247, 260, 279, 303,
+ 346, 362, 385, 412, 461, 480, 507, 537, 588, 605, 629, 655, 699,
+ 714, 735, 757, 794, 807, 825, 843, 873, 884, 899, 913, 936, 945,
+ 957, 967, 259, 278, 302, 320, 361, 384, 411, 431, 479, 506, 536,
+ 558, 604, 628, 654, 673, 713, 734, 756, 772, 806, 824, 842, 855,
+ 883, 898, 912, 922, 944, 956, 966, 973, 277, 301, 319, 331, 383,
+ 410, 430, 443, 505, 535, 557, 571, 627, 653, 672, 684, 733, 755,
+ 771, 781, 823, 841, 854, 862, 897, 911, 921, 927, 955, 965, 972,
+ 976, 337, 345, 360, 382, 450, 460, 478, 504, 578, 587, 603, 626,
+ 690, 698, 712, 732, 786, 793, 805, 822, 866, 872, 882, 896, 930,
+ 935, 943, 954, 978, 982, 988, 996, 344, 359, 381, 409, 459, 477,
+ 503, 534, 586, 602, 625, 652, 697, 711, 731, 754, 792, 804, 821,
+ 840, 871, 881, 895, 910, 934, 942, 953, 964, 981, 987, 995, 1002,
+ 358, 380, 408, 429, 476, 502, 533, 556, 601, 624, 651, 671, 710,
+ 730, 753, 770, 803, 820, 839, 853, 880, 894, 909, 920, 941, 952,
+ 963, 971, 986, 994, 1001, 1006, 379, 407, 428, 442, 501, 532, 555,
+ 570, 623, 650, 670, 683, 729, 752, 769, 780, 819, 838, 852, 861,
+ 893, 908, 919, 926, 951, 962, 970, 975, 993, 1000, 1005, 1008, 449,
+ 458, 475, 500, 577, 585, 600, 622, 689, 696, 709, 728, 785, 791,
+ 802, 818, 865, 870, 879, 892, 929, 933, 940, 950, 977, 980, 985,
+ 992, 1009, 1011, 1014, 1018, 457, 474, 499, 531, 584, 599, 621, 649,
+ 695, 708, 727, 751, 790, 801, 817, 837, 869, 878, 891, 907, 932,
+ 939, 949, 961, 979, 984, 991, 999, 1010, 1013, 1017, 1021, 473, 498,
+ 530, 554, 598, 620, 648, 669, 707, 726, 750, 768, 800, 816, 836,
+ 851, 877, 890, 906, 918, 938, 948, 960, 969, 983, 990, 998, 1004,
+ 1012, 1016, 1020, 1023, 497, 529, 553, 569, 619, 647, 668, 682, 725,
+ 749, 767, 779, 815, 835, 850, 860, 889, 905, 917, 925, 947, 959,
+ 968, 974, 989, 997, 1003, 1007, 1015, 1019, 1022, 1024,
};
const scan_order vp9_default_scan_orders[TX_SIZES] = {
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 8a8d2ad86..db3e74663 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -426,7 +426,9 @@ static INLINE int assign_mv(VP9_COMMON *cm, MACROBLOCKD *xd,
zero_mv_pair(mv);
break;
}
- default: { return 0; }
+ default: {
+ return 0;
+ }
}
return ret;
}
@@ -755,7 +757,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
if (!assign_mv(cm, xd, b_mode, mi->bmi[j].as_mv, best_ref_mvs,
best_sub8x8, is_compound, allow_hp, r)) {
xd->corrupted |= 1;
- break;
+ return;
}
if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j];
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index c2e6b3d54..3ed1bd6ff 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -133,17 +133,18 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
int16_t dqv = dq[0];
const uint8_t *const cat6_prob =
#if CONFIG_VP9_HIGHBITDEPTH
- (xd->bd == VPX_BITS_12)
- ? vp9_cat6_prob_high12
- : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 :
+ (xd->bd == VPX_BITS_12) ? vp9_cat6_prob_high12
+ : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2
+ :
#endif // CONFIG_VP9_HIGHBITDEPTH
- vp9_cat6_prob;
+ vp9_cat6_prob;
const int cat6_bits =
#if CONFIG_VP9_HIGHBITDEPTH
- (xd->bd == VPX_BITS_12) ? 18
- : (xd->bd == VPX_BITS_10) ? 16 :
+ (xd->bd == VPX_BITS_12) ? 18
+ : (xd->bd == VPX_BITS_10) ? 16
+ :
#endif // CONFIG_VP9_HIGHBITDEPTH
- 14;
+ 14;
// Keep value, range, and count as locals. The compiler produces better
// results with the locals than using r directly.
BD_VALUE value = r->value;
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
index a07a1608d..5961be5f3 100644
--- a/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -18,28 +18,30 @@
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in,
int stride) {
- // { 0, 1, 1, 1, 1, 1, 1, 1 };
- const int16x8_t nonzero_bias_a = vextq_s16(vdupq_n_s16(0), vdupq_n_s16(1), 7);
- // { 1, 0, 0, 0, 0, 0, 0, 0 };
- const int16x8_t nonzero_bias_b = vextq_s16(vdupq_n_s16(1), vdupq_n_s16(0), 7);
- int16x8_t mask;
+ // { 0, 1, 1, 1 };
+ const int16x4_t nonzero_bias_a = vext_s16(vdup_n_s16(0), vdup_n_s16(1), 3);
+ // { 1, 0, 0, 0 };
+ const int16x4_t nonzero_bias_b = vext_s16(vdup_n_s16(1), vdup_n_s16(0), 3);
+ int16x4_t mask;
int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
- in[0] = vcombine_s16(input_0, input_1);
- in[1] = vcombine_s16(input_2, input_3);
-
// Copy the SSE method, use a mask to avoid an 'if' branch here to increase by
// one non-zero first elements
- mask = vreinterpretq_s16_u16(vceqq_s16(in[0], nonzero_bias_a));
- in[0] = vaddq_s16(in[0], mask);
- in[0] = vaddq_s16(in[0], nonzero_bias_b);
+ mask = vreinterpret_s16_u16(vceq_s16(input_0, nonzero_bias_a));
+ input_0 = vadd_s16(input_0, mask);
+ input_0 = vadd_s16(input_0, nonzero_bias_b);
+
+ in[0] = vcombine_s16(input_0, input_1);
+ in[1] = vcombine_s16(input_2, input_3);
}
static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) {
@@ -53,72 +55,54 @@ static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) {
}
static INLINE void fadst4x4_neon(int16x8_t *in) {
- int32x4_t u0, u1, u2, u3;
- int16x4_t out_0, out_1, out_2, out_3;
- const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING);
+ int32x4_t u[4], t[4];
+ int16x4_t s[4], out[4];
- const int16x4_t s0 = vget_low_s16(in[0]); // | x_00 | x_01 | x_02 | x_03 |
- const int16x4_t s1 = vget_high_s16(in[0]); // | x_10 | x_11 | x_12 | x_13 |
- const int16x4_t s2 = vget_low_s16(in[1]); // | x_20 | x_21 | x_22 | x_23 |
- const int16x4_t s3 = vget_high_s16(in[1]); // | x_30 | x_31 | x_32 | x_33 |
+ s[0] = vget_low_s16(in[0]); // | x_00 | x_01 | x_02 | x_03 |
+ s[1] = vget_high_s16(in[0]); // | x_10 | x_11 | x_12 | x_13 |
+ s[2] = vget_low_s16(in[1]); // | x_20 | x_21 | x_22 | x_23 |
+ s[3] = vget_high_s16(in[1]); // | x_30 | x_31 | x_32 | x_33 |
- // s0 * sinpi_1_9, s0 * sinpi_4_9
// Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
- const int32x4_t s0s1_9 = vmull_n_s16(s0, sinpi_1_9);
- const int32x4_t s0s4_9 = vmull_n_s16(s0, sinpi_4_9);
- // s1 * sinpi_1_9, s1 * sinpi_2_9
- const int32x4_t s1s1_9 = vmull_n_s16(s1, sinpi_1_9);
- const int32x4_t s1s2_9 = vmull_n_s16(s1, sinpi_2_9);
- // s2 * sinpi_3_9
- const int32x4_t s2s3_9 = vmull_n_s16(s2, sinpi_3_9);
- // s3 * sinpi_2_9, s3 * sinpi_4_9
- const int32x4_t s3s2_9 = vmull_n_s16(s3, sinpi_2_9);
- const int32x4_t s3s4_9 = vmull_n_s16(s3, sinpi_4_9);
-
- // (s0 + s1) * sinpi_3_9
- const int32x4_t s0_p_s1 = vaddl_s16(s0, s1);
- const int32x4_t s0_p_s1_m_s3 = vsubw_s16(s0_p_s1, s3);
-
- // s_0 * sinpi_1_9 + s_1 * sinpi_2_9
- // s_0 * sinpi_4_9 - s_1 * sinpi_1_9
- const int32x4_t s0s1_9_p_s1s2_9 = vaddq_s32(s0s1_9, s1s2_9);
- const int32x4_t s0s4_9_m_s1s1_9 = vsubq_s32(s0s4_9, s1s1_9);
- /*
- * t0 = s0s1_9 + s1s2_9 + s3s4_9
- * t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9
- * t2 = s0s4_9 - s1s1_9 + s3s2_9
- * t3 = s2s3_9
- */
- const int32x4_t t0 = vaddq_s32(s0s1_9_p_s1s2_9, s3s4_9);
- const int32x4_t t1 = vmulq_n_s32(s0_p_s1_m_s3, sinpi_3_9);
- const int32x4_t t2 = vaddq_s32(s0s4_9_m_s1s1_9, s3s2_9);
- const int32x4_t t3 = s2s3_9;
+ // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9
+ t[0] = vmull_n_s16(s[0], sinpi_1_9);
+ t[0] = vmlal_n_s16(t[0], s[1], sinpi_2_9);
+ t[0] = vmlal_n_s16(t[0], s[3], sinpi_4_9);
+
+ // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9
+ t[1] = vmull_n_s16(s[0], sinpi_3_9);
+ t[1] = vmlal_n_s16(t[1], s[1], sinpi_3_9);
+ t[1] = vmlsl_n_s16(t[1], s[3], sinpi_3_9);
+
+ // t2 = s0 * sinpi_4_9 - s1* sinpi_1_9 + s3 * sinpi_2_9
+ t[2] = vmull_n_s16(s[0], sinpi_4_9);
+ t[2] = vmlsl_n_s16(t[2], s[1], sinpi_1_9);
+ t[2] = vmlal_n_s16(t[2], s[3], sinpi_2_9);
+
+ // t3 = s2 * sinpi_3_9
+ t[3] = vmull_n_s16(s[2], sinpi_3_9);
+
/*
* u0 = t0 + t3
* u1 = t1
* u2 = t2 - t3
* u3 = t2 - t0 + t3
*/
- u0 = vaddq_s32(t0, t3);
- u1 = t1;
- u2 = vsubq_s32(t2, t3);
- u3 = vaddq_s32(vsubq_s32(t2, t0), t3);
+ u[0] = vaddq_s32(t[0], t[3]);
+ u[1] = t[1];
+ u[2] = vsubq_s32(t[2], t[3]);
+ u[3] = vaddq_s32(vsubq_s32(t[2], t[0]), t[3]);
// fdct_round_shift
- u0 = vaddq_s32(u0, k__DCT_CONST_ROUNDING);
- u1 = vaddq_s32(u1, k__DCT_CONST_ROUNDING);
- u2 = vaddq_s32(u2, k__DCT_CONST_ROUNDING);
- u3 = vaddq_s32(u3, k__DCT_CONST_ROUNDING);
-
- out_0 = vshrn_n_s32(u0, DCT_CONST_BITS);
- out_1 = vshrn_n_s32(u1, DCT_CONST_BITS);
- out_2 = vshrn_n_s32(u2, DCT_CONST_BITS);
- out_3 = vshrn_n_s32(u3, DCT_CONST_BITS);
+ out[0] = vrshrn_n_s32(u[0], DCT_CONST_BITS);
+ out[1] = vrshrn_n_s32(u[1], DCT_CONST_BITS);
+ out[2] = vrshrn_n_s32(u[2], DCT_CONST_BITS);
+ out[3] = vrshrn_n_s32(u[3], DCT_CONST_BITS);
- transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+ transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
- in[0] = vcombine_s16(out_0, out_1);
- in[1] = vcombine_s16(out_2, out_3);
+ in[0] = vcombine_s16(out[0], out[1]);
+ in[1] = vcombine_s16(out[2], out[3]);
}
void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride,
@@ -130,12 +114,14 @@ void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride,
case ADST_DCT:
load_buffer_4x4(input, in, stride);
fadst4x4_neon(in);
- vpx_fdct4x4_pass1_neon((int16x4_t *)in);
+ // pass1 variant is not accurate enough
+ vpx_fdct4x4_pass2_neon((int16x4_t *)in);
write_buffer_4x4(output, in);
break;
case DCT_ADST:
load_buffer_4x4(input, in, stride);
- vpx_fdct4x4_pass1_neon((int16x4_t *)in);
+ // pass1 variant is not accurate enough
+ vpx_fdct4x4_pass2_neon((int16x4_t *)in);
fadst4x4_neon(in);
write_buffer_4x4(output, in);
break;
@@ -235,245 +221,158 @@ static INLINE void write_buffer_8x8(tran_low_t *output, int16x8_t *res,
}
static INLINE void fadst8x8_neon(int16x8_t *in) {
- int16x4_t x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi, x4_lo,
- x4_hi, x5_lo, x5_hi, x6_lo, x6_hi, x7_lo, x7_hi;
- int32x4_t s0_lo, s0_hi, s1_lo, s1_hi, s2_lo, s2_hi, s3_lo, s3_hi, s4_lo,
- s4_hi, s5_lo, s5_hi, s6_lo, s6_hi, s7_lo, s7_hi;
- int32x4_t t0_lo, t0_hi, t1_lo, t1_hi, t2_lo, t2_hi, t3_lo, t3_hi, t4_lo,
- t4_hi, t5_lo, t5_hi, t6_lo, t6_hi, t7_lo, t7_hi;
- const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING);
-
- x0_lo = vget_low_s16(in[7]);
- x0_hi = vget_high_s16(in[7]);
- x1_lo = vget_low_s16(in[0]);
- x1_hi = vget_high_s16(in[0]);
- x2_lo = vget_low_s16(in[5]);
- x2_hi = vget_high_s16(in[5]);
- x3_lo = vget_low_s16(in[2]);
- x3_hi = vget_high_s16(in[2]);
- x4_lo = vget_low_s16(in[3]);
- x4_hi = vget_high_s16(in[3]);
- x5_lo = vget_low_s16(in[4]);
- x5_hi = vget_high_s16(in[4]);
- x6_lo = vget_low_s16(in[1]);
- x6_hi = vget_high_s16(in[1]);
- x7_lo = vget_low_s16(in[6]);
- x7_hi = vget_high_s16(in[6]);
+ int16x4_t x_lo[8], x_hi[8];
+ int32x4_t s_lo[8], s_hi[8];
+ int32x4_t t_lo[8], t_hi[8];
+
+ x_lo[0] = vget_low_s16(in[7]);
+ x_hi[0] = vget_high_s16(in[7]);
+ x_lo[1] = vget_low_s16(in[0]);
+ x_hi[1] = vget_high_s16(in[0]);
+ x_lo[2] = vget_low_s16(in[5]);
+ x_hi[2] = vget_high_s16(in[5]);
+ x_lo[3] = vget_low_s16(in[2]);
+ x_hi[3] = vget_high_s16(in[2]);
+ x_lo[4] = vget_low_s16(in[3]);
+ x_hi[4] = vget_high_s16(in[3]);
+ x_lo[5] = vget_low_s16(in[4]);
+ x_hi[5] = vget_high_s16(in[4]);
+ x_lo[6] = vget_low_s16(in[1]);
+ x_hi[6] = vget_high_s16(in[1]);
+ x_lo[7] = vget_low_s16(in[6]);
+ x_hi[7] = vget_high_s16(in[6]);
// stage 1
// s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- s0_lo = vaddq_s32(vmull_n_s16(x0_lo, cospi_2_64),
- vmull_n_s16(x1_lo, cospi_30_64));
- s0_hi = vaddq_s32(vmull_n_s16(x0_hi, cospi_2_64),
- vmull_n_s16(x1_hi, cospi_30_64));
// s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
- s1_lo = vsubq_s32(vmull_n_s16(x0_lo, cospi_30_64),
- vmull_n_s16(x1_lo, cospi_2_64));
- s1_hi = vsubq_s32(vmull_n_s16(x0_hi, cospi_30_64),
- vmull_n_s16(x1_hi, cospi_2_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1],
+ cospi_2_64, cospi_30_64, &s_lo[0],
+ &s_hi[0], &s_lo[1], &s_hi[1]);
+
// s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- s2_lo = vaddq_s32(vmull_n_s16(x2_lo, cospi_10_64),
- vmull_n_s16(x3_lo, cospi_22_64));
- s2_hi = vaddq_s32(vmull_n_s16(x2_hi, cospi_10_64),
- vmull_n_s16(x3_hi, cospi_22_64));
// s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- s3_lo = vsubq_s32(vmull_n_s16(x2_lo, cospi_22_64),
- vmull_n_s16(x3_lo, cospi_10_64));
- s3_hi = vsubq_s32(vmull_n_s16(x2_hi, cospi_22_64),
- vmull_n_s16(x3_hi, cospi_10_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3],
+ cospi_10_64, cospi_22_64, &s_lo[2],
+ &s_hi[2], &s_lo[3], &s_hi[3]);
+
// s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- s4_lo = vaddq_s32(vmull_n_s16(x4_lo, cospi_18_64),
- vmull_n_s16(x5_lo, cospi_14_64));
- s4_hi = vaddq_s32(vmull_n_s16(x4_hi, cospi_18_64),
- vmull_n_s16(x5_hi, cospi_14_64));
// s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
- s5_lo = vsubq_s32(vmull_n_s16(x4_lo, cospi_14_64),
- vmull_n_s16(x5_lo, cospi_18_64));
- s5_hi = vsubq_s32(vmull_n_s16(x4_hi, cospi_14_64),
- vmull_n_s16(x5_hi, cospi_18_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5],
+ cospi_18_64, cospi_14_64, &s_lo[4],
+ &s_hi[4], &s_lo[5], &s_hi[5]);
+
// s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- s6_lo = vaddq_s32(vmull_n_s16(x6_lo, cospi_26_64),
- vmull_n_s16(x7_lo, cospi_6_64));
- s6_hi = vaddq_s32(vmull_n_s16(x6_hi, cospi_26_64),
- vmull_n_s16(x7_hi, cospi_6_64));
// s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
- s7_lo = vsubq_s32(vmull_n_s16(x6_lo, cospi_6_64),
- vmull_n_s16(x7_lo, cospi_26_64));
- s7_hi = vsubq_s32(vmull_n_s16(x6_hi, cospi_6_64),
- vmull_n_s16(x7_hi, cospi_26_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7],
+ cospi_26_64, cospi_6_64, &s_lo[6],
+ &s_hi[6], &s_lo[7], &s_hi[7]);
// fdct_round_shift
- t0_lo = vaddq_s32(s0_lo, s4_lo);
- t0_hi = vaddq_s32(s0_hi, s4_hi);
- t1_lo = vaddq_s32(s1_lo, s5_lo);
- t1_hi = vaddq_s32(s1_hi, s5_hi);
- t2_lo = vaddq_s32(s2_lo, s6_lo);
- t2_hi = vaddq_s32(s2_hi, s6_hi);
- t3_lo = vaddq_s32(s3_lo, s7_lo);
- t3_hi = vaddq_s32(s3_hi, s7_hi);
- t4_lo = vsubq_s32(s0_lo, s4_lo);
- t4_hi = vsubq_s32(s0_hi, s4_hi);
- t5_lo = vsubq_s32(s1_lo, s5_lo);
- t5_hi = vsubq_s32(s1_hi, s5_hi);
- t6_lo = vsubq_s32(s2_lo, s6_lo);
- t6_hi = vsubq_s32(s2_hi, s6_hi);
- t7_lo = vsubq_s32(s3_lo, s7_lo);
- t7_hi = vsubq_s32(s3_hi, s7_hi);
-
- t0_lo = vaddq_s32(t0_lo, k__DCT_CONST_ROUNDING);
- t0_hi = vaddq_s32(t0_hi, k__DCT_CONST_ROUNDING);
- t1_lo = vaddq_s32(t1_lo, k__DCT_CONST_ROUNDING);
- t1_hi = vaddq_s32(t1_hi, k__DCT_CONST_ROUNDING);
- t2_lo = vaddq_s32(t2_lo, k__DCT_CONST_ROUNDING);
- t2_hi = vaddq_s32(t2_hi, k__DCT_CONST_ROUNDING);
- t3_lo = vaddq_s32(t3_lo, k__DCT_CONST_ROUNDING);
- t3_hi = vaddq_s32(t3_hi, k__DCT_CONST_ROUNDING);
- t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING);
- t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING);
- t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING);
- t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING);
- t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING);
- t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING);
- t7_lo = vaddq_s32(t7_lo, k__DCT_CONST_ROUNDING);
- t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING);
-
- t0_lo = vshrq_n_s32(t0_lo, DCT_CONST_BITS);
- t0_hi = vshrq_n_s32(t0_hi, DCT_CONST_BITS);
- t1_lo = vshrq_n_s32(t1_lo, DCT_CONST_BITS);
- t1_hi = vshrq_n_s32(t1_hi, DCT_CONST_BITS);
- t2_lo = vshrq_n_s32(t2_lo, DCT_CONST_BITS);
- t2_hi = vshrq_n_s32(t2_hi, DCT_CONST_BITS);
- t3_lo = vshrq_n_s32(t3_lo, DCT_CONST_BITS);
- t3_hi = vshrq_n_s32(t3_hi, DCT_CONST_BITS);
- t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS);
- t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS);
- t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS);
- t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS);
- t6_lo = vshrq_n_s32(t6_lo, DCT_CONST_BITS);
- t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS);
- t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS);
- t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS);
+ t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS);
+ t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS);
+ t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS);
+ t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS);
+ t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS);
+ t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS);
+ t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS);
+ t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS);
+ t_lo[4] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS);
+ t_hi[4] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS);
+ t_lo[5] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS);
+ t_hi[5] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS);
+ t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS);
+ t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS);
+ t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS);
+ t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS);
// stage 2
- s0_lo = t0_lo;
- s0_hi = t0_hi;
- s1_lo = t1_lo;
- s1_hi = t1_hi;
- s2_lo = t2_lo;
- s2_hi = t2_hi;
- s3_lo = t3_lo;
- s3_hi = t3_hi;
- s4_lo = vaddq_s32(vmulq_n_s32(t4_lo, cospi_8_64),
- vmulq_n_s32(t5_lo, cospi_24_64));
- s4_hi = vaddq_s32(vmulq_n_s32(t4_hi, cospi_8_64),
- vmulq_n_s32(t5_hi, cospi_24_64));
- s5_lo = vsubq_s32(vmulq_n_s32(t4_lo, cospi_24_64),
- vmulq_n_s32(t5_lo, cospi_8_64));
- s5_hi = vsubq_s32(vmulq_n_s32(t4_hi, cospi_24_64),
- vmulq_n_s32(t5_hi, cospi_8_64));
- s6_lo = vaddq_s32(vmulq_n_s32(t6_lo, -cospi_24_64),
- vmulq_n_s32(t7_lo, cospi_8_64));
- s6_hi = vaddq_s32(vmulq_n_s32(t6_hi, -cospi_24_64),
- vmulq_n_s32(t7_hi, cospi_8_64));
- s7_lo = vaddq_s32(vmulq_n_s32(t6_lo, cospi_8_64),
- vmulq_n_s32(t7_lo, cospi_24_64));
- s7_hi = vaddq_s32(vmulq_n_s32(t6_hi, cospi_8_64),
- vmulq_n_s32(t7_hi, cospi_24_64));
+ s_lo[0] = t_lo[0];
+ s_hi[0] = t_hi[0];
+ s_lo[1] = t_lo[1];
+ s_hi[1] = t_hi[1];
+ s_lo[2] = t_lo[2];
+ s_hi[2] = t_hi[2];
+ s_lo[3] = t_lo[3];
+ s_hi[3] = t_hi[3];
+ // s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ // s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5],
+ cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4],
+ &s_lo[5], &s_hi[5]);
+
+ // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ // s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+ butterfly_two_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7],
+ -cospi_24_64, cospi_8_64, &s_lo[6], &s_hi[6],
+ &s_lo[7], &s_hi[7]);
+ // fdct_round_shift
// s0 + s2
- t0_lo = vaddq_s32(s0_lo, s2_lo);
- t0_hi = vaddq_s32(s0_hi, s2_hi);
+ t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]);
+ t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]);
// s1 + s3
- t1_lo = vaddq_s32(s1_lo, s3_lo);
- t1_hi = vaddq_s32(s1_hi, s3_hi);
+ t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]);
+ t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]);
// s0 - s2
- t2_lo = vsubq_s32(s0_lo, s2_lo);
- t2_hi = vsubq_s32(s0_hi, s2_hi);
+ t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]);
+ t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]);
// s1 - s3
- t3_lo = vsubq_s32(s1_lo, s3_lo);
- t3_hi = vsubq_s32(s1_hi, s3_hi);
+ t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]);
+ t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]);
// s4 + s6
- t4_lo = vaddq_s32(s4_lo, s6_lo);
- t4_hi = vaddq_s32(s4_hi, s6_hi);
+ t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS);
+ t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS);
// s5 + s7
- t5_lo = vaddq_s32(s5_lo, s7_lo);
- t5_hi = vaddq_s32(s5_hi, s7_hi);
+ t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS);
+ t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS);
// s4 - s6
- t6_lo = vsubq_s32(s4_lo, s6_lo);
- t6_hi = vsubq_s32(s4_hi, s6_hi);
+ t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS);
+ t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS);
// s5 - s7
- t7_lo = vsubq_s32(s5_lo, s7_lo);
- t7_hi = vsubq_s32(s5_hi, s7_hi);
-
- // fdct_round_shift
- t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING);
- t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING);
- t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING);
- t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING);
- t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING);
- t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING);
- t7_lo = vaddq_s32(t7_lo, k__DCT_CONST_ROUNDING);
- t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING);
- t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS);
- t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS);
- t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS);
- t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS);
- t6_lo = vshrq_n_s32(t6_lo, DCT_CONST_BITS);
- t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS);
- t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS);
- t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS);
+ t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS);
+ t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS);
// stage 3
// cospi_16_64 * (x2 + x3)
- s2_lo = vmulq_n_s32(vaddq_s32(t2_lo, t3_lo), cospi_16_64);
- s2_hi = vmulq_n_s32(vaddq_s32(t2_hi, t3_hi), cospi_16_64);
// cospi_16_64 * (x2 - x3)
- s3_lo = vmulq_n_s32(vsubq_s32(t2_lo, t3_lo), cospi_16_64);
- s3_hi = vmulq_n_s32(vsubq_s32(t2_hi, t3_hi), cospi_16_64);
+ butterfly_one_coeff_s32_noround(t_lo[2], t_hi[2], t_lo[3], t_hi[3],
+ cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3],
+ &s_hi[3]);
+
// cospi_16_64 * (x6 + x7)
- s6_lo = vmulq_n_s32(vaddq_s32(t6_lo, t7_lo), cospi_16_64);
- s6_hi = vmulq_n_s32(vaddq_s32(t6_hi, t7_hi), cospi_16_64);
// cospi_16_64 * (x2 - x3)
- s7_lo = vmulq_n_s32(vsubq_s32(t6_lo, t7_lo), cospi_16_64);
- s7_hi = vmulq_n_s32(vsubq_s32(t6_hi, t7_hi), cospi_16_64);
+ butterfly_one_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7],
+ cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7],
+ &s_hi[7]);
// final fdct_round_shift
- t2_lo = vaddq_s32(s2_lo, k__DCT_CONST_ROUNDING);
- t2_hi = vaddq_s32(s2_hi, k__DCT_CONST_ROUNDING);
- t3_lo = vaddq_s32(s3_lo, k__DCT_CONST_ROUNDING);
- t3_hi = vaddq_s32(s3_hi, k__DCT_CONST_ROUNDING);
- t6_lo = vaddq_s32(s6_lo, k__DCT_CONST_ROUNDING);
- t6_hi = vaddq_s32(s6_hi, k__DCT_CONST_ROUNDING);
- t7_lo = vaddq_s32(s7_lo, k__DCT_CONST_ROUNDING);
- t7_hi = vaddq_s32(s7_hi, k__DCT_CONST_ROUNDING);
-
- x2_lo = vshrn_n_s32(t2_lo, DCT_CONST_BITS);
- x2_hi = vshrn_n_s32(t2_hi, DCT_CONST_BITS);
- x3_lo = vshrn_n_s32(t3_lo, DCT_CONST_BITS);
- x3_hi = vshrn_n_s32(t3_hi, DCT_CONST_BITS);
- x6_lo = vshrn_n_s32(t6_lo, DCT_CONST_BITS);
- x6_hi = vshrn_n_s32(t6_hi, DCT_CONST_BITS);
- x7_lo = vshrn_n_s32(t7_lo, DCT_CONST_BITS);
- x7_hi = vshrn_n_s32(t7_hi, DCT_CONST_BITS);
+ x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS);
+ x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS);
+ x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS);
+ x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS);
+ x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS);
+ x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS);
+ x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS);
+ x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS);
// x0, x1, x4, x5 narrow down to 16-bits directly
- x0_lo = vmovn_s32(t0_lo);
- x0_hi = vmovn_s32(t0_hi);
- x1_lo = vmovn_s32(t1_lo);
- x1_hi = vmovn_s32(t1_hi);
- x4_lo = vmovn_s32(t4_lo);
- x4_hi = vmovn_s32(t4_hi);
- x5_lo = vmovn_s32(t5_lo);
- x5_hi = vmovn_s32(t5_hi);
-
- in[0] = vcombine_s16(x0_lo, x0_hi);
- in[1] = vnegq_s16(vcombine_s16(x4_lo, x4_hi));
- in[2] = vcombine_s16(x6_lo, x6_hi);
- in[3] = vnegq_s16(vcombine_s16(x2_lo, x2_hi));
- in[4] = vcombine_s16(x3_lo, x3_hi);
- in[5] = vnegq_s16(vcombine_s16(x7_lo, x7_hi));
- in[6] = vcombine_s16(x5_lo, x5_hi);
- in[7] = vnegq_s16(vcombine_s16(x1_lo, x1_hi));
+ x_lo[0] = vmovn_s32(t_lo[0]);
+ x_hi[0] = vmovn_s32(t_hi[0]);
+ x_lo[1] = vmovn_s32(t_lo[1]);
+ x_hi[1] = vmovn_s32(t_hi[1]);
+ x_lo[4] = vmovn_s32(t_lo[4]);
+ x_hi[4] = vmovn_s32(t_hi[4]);
+ x_lo[5] = vmovn_s32(t_lo[5]);
+ x_hi[5] = vmovn_s32(t_hi[5]);
+
+ in[0] = vcombine_s16(x_lo[0], x_hi[0]);
+ in[1] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4]));
+ in[2] = vcombine_s16(x_lo[6], x_hi[6]);
+ in[3] = vnegq_s16(vcombine_s16(x_lo[2], x_hi[2]));
+ in[4] = vcombine_s16(x_lo[3], x_hi[3]);
+ in[5] = vnegq_s16(vcombine_s16(x_lo[7], x_hi[7]));
+ in[6] = vcombine_s16(x_lo[5], x_hi[5]);
+ in[7] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1]));
transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
&in[7]);
@@ -488,13 +387,15 @@ void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride,
case ADST_DCT:
load_buffer_8x8(input, in, stride);
fadst8x8_neon(in);
- vpx_fdct8x8_pass1_neon(in);
+ // pass1 variant is not accurate enough
+ vpx_fdct8x8_pass2_neon(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case DCT_ADST:
load_buffer_8x8(input, in, stride);
- vpx_fdct8x8_pass1_neon(in);
+ // pass1 variant is not accurate enough
+ vpx_fdct8x8_pass2_neon(in);
fadst8x8_neon(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
@@ -547,7 +448,6 @@ static void fdct16_8col(int16x8_t *in) {
int16x8_t i[8], s1[8], s2[8], s3[8], t[8];
int16x4_t t_lo[8], t_hi[8];
int32x4_t u_lo[8], u_hi[8];
- const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING);
// stage 1
i[0] = vaddq_s16(in[0], in[15]);
@@ -559,7 +459,8 @@ static void fdct16_8col(int16x8_t *in) {
i[6] = vaddq_s16(in[6], in[9]);
i[7] = vaddq_s16(in[7], in[8]);
- vpx_fdct8x8_pass1_neon(i);
+ // pass1 variant is not accurate enough
+ vpx_fdct8x8_pass2_neon(i);
transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]);
// step 2
@@ -595,23 +496,14 @@ static void fdct16_8col(int16x8_t *in) {
u_lo[5] = vmull_n_s16(t_lo[5], cospi_16_64);
u_hi[5] = vmull_n_s16(t_hi[5], cospi_16_64);
- u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING);
- u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING);
- u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING);
- u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING);
- u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING);
- u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING);
- u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING);
- u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING);
-
- t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS);
- t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS);
- t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS);
- t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS);
- t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS);
- t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS);
- t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS);
- t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+ t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+ t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+ t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS);
+ t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS);
+ t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS);
+ t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS);
+ t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+ t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS);
s2[2] = vcombine_s16(t_lo[2], t_hi[2]);
s2[3] = vcombine_s16(t_lo[3], t_hi[3]);
@@ -646,40 +538,26 @@ static void fdct16_8col(int16x8_t *in) {
t_lo[7] = vget_low_s16(s3[7]);
t_hi[7] = vget_high_s16(s3[7]);
- u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_8_64),
- vmull_n_s16(t_lo[6], cospi_24_64));
- u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_8_64),
- vmull_n_s16(t_hi[6], cospi_24_64));
- u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_24_64),
- vmull_n_s16(t_lo[5], cospi_8_64));
- u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_24_64),
- vmull_n_s16(t_hi[5], cospi_8_64));
- u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_8_64),
- vmull_n_s16(t_lo[5], -cospi_24_64));
- u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_8_64),
- vmull_n_s16(t_hi[5], -cospi_24_64));
- u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_24_64),
- vmull_n_s16(t_lo[6], cospi_8_64));
- u_hi[6] = vaddq_s32(vmull_n_s16(t_hi[1], cospi_24_64),
- vmull_n_s16(t_hi[6], cospi_8_64));
-
- u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING);
- u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING);
- u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING);
- u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING);
- u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING);
- u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING);
- u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING);
- u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING);
-
- t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS);
- t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS);
- t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS);
- t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS);
- t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS);
- t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS);
- t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS);
- t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS);
+ // u[1] = -cospi_8_64 * t[1] + cospi_24_64 * t[6]
+ // u[6] = cospi_24_64 * t[1] + cospi_8_64 * t[6]
+ butterfly_two_coeff_s16_s32_noround(t_lo[1], t_hi[1], t_lo[6], t_hi[6],
+ -cospi_8_64, cospi_24_64, &u_lo[1],
+ &u_hi[1], &u_lo[6], &u_hi[6]);
+
+ // u[5] = -cospi_24_64 * t[5] + cospi_8_64 * t[2]
+ // u[2] = cospi_8_64 * t[5] + cospi_24_64 * t[2]
+ butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2],
+ -cospi_24_64, cospi_8_64, &u_lo[5],
+ &u_hi[5], &u_lo[2], &u_hi[2]);
+
+ t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS);
+ t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS);
+ t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+ t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+ t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+ t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+ t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS);
+ t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS);
s2[1] = vcombine_s16(t_lo[1], t_hi[1]);
s2[2] = vcombine_s16(t_lo[2], t_hi[2]);
@@ -714,88 +592,47 @@ static void fdct16_8col(int16x8_t *in) {
t_lo[7] = vget_low_s16(s1[7]);
t_hi[7] = vget_high_s16(s1[7]);
- // step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
- u_lo[0] = vaddq_s32(vmull_n_s16(t_lo[0], cospi_30_64),
- vmull_n_s16(t_lo[7], cospi_2_64));
- u_hi[0] = vaddq_s32(vmull_n_s16(t_hi[0], cospi_30_64),
- vmull_n_s16(t_hi[7], cospi_2_64));
-
- // step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
- u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_14_64),
- vmull_n_s16(t_lo[6], cospi_18_64));
- u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], cospi_14_64),
- vmull_n_s16(t_hi[6], cospi_18_64));
-
- // step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
- u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_22_64),
- vmull_n_s16(t_lo[5], cospi_10_64));
- u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_22_64),
- vmull_n_s16(t_hi[5], cospi_10_64));
-
- // step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
- u_lo[3] = vaddq_s32(vmull_n_s16(t_lo[3], cospi_6_64),
- vmull_n_s16(t_lo[4], cospi_26_64));
- u_hi[3] = vaddq_s32(vmull_n_s16(t_hi[3], cospi_6_64),
- vmull_n_s16(t_hi[4], cospi_26_64));
-
- // step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
- u_lo[4] = vaddq_s32(vmull_n_s16(t_lo[3], -cospi_26_64),
- vmull_n_s16(t_lo[4], cospi_6_64));
- u_hi[4] = vaddq_s32(vmull_n_s16(t_hi[3], -cospi_26_64),
- vmull_n_s16(t_hi[4], cospi_6_64));
-
- // step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
- u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], -cospi_10_64),
- vmull_n_s16(t_lo[5], cospi_22_64));
- u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], -cospi_10_64),
- vmull_n_s16(t_hi[5], cospi_22_64));
-
- // step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
- u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_18_64),
- vmull_n_s16(t_lo[6], cospi_14_64));
- u_hi[6] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_18_64),
- vmull_n_s16(t_hi[6], cospi_14_64));
-
- // step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
- u_lo[7] = vaddq_s32(vmull_n_s16(t_lo[0], -cospi_2_64),
- vmull_n_s16(t_lo[7], cospi_30_64));
- u_hi[7] = vaddq_s32(vmull_n_s16(t_hi[0], -cospi_2_64),
- vmull_n_s16(t_hi[7], cospi_30_64));
+ // u[0] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64
+ // u[7] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64
+ butterfly_two_coeff_s16_s32_noround(t_lo[7], t_hi[7], t_lo[0], t_hi[0],
+ cospi_2_64, cospi_30_64, &u_lo[0],
+ &u_hi[0], &u_lo[7], &u_hi[7]);
+
+ // u[1] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64
+ // u[6] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64
+ butterfly_two_coeff_s16_s32_noround(t_lo[6], t_hi[6], t_lo[1], t_hi[1],
+ cospi_18_64, cospi_14_64, &u_lo[1],
+ &u_hi[1], &u_lo[6], &u_hi[6]);
+
+ // u[2] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64
+ // u[5] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64
+ butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2],
+ cospi_10_64, cospi_22_64, &u_lo[2],
+ &u_hi[2], &u_lo[5], &u_hi[5]);
+
+ // u[3] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64
+ // u[4] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64
+ butterfly_two_coeff_s16_s32_noround(t_lo[4], t_hi[4], t_lo[3], t_hi[3],
+ cospi_26_64, cospi_6_64, &u_lo[3],
+ &u_hi[3], &u_lo[4], &u_hi[4]);
// final fdct_round_shift
- u_lo[0] = vaddq_s32(u_lo[0], k__DCT_CONST_ROUNDING);
- u_hi[0] = vaddq_s32(u_hi[0], k__DCT_CONST_ROUNDING);
- u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING);
- u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING);
- u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING);
- u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING);
- u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING);
- u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING);
- u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING);
- u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING);
- u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING);
- u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING);
- u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING);
- u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING);
- u_lo[7] = vaddq_s32(u_lo[7], k__DCT_CONST_ROUNDING);
- u_hi[7] = vaddq_s32(u_hi[7], k__DCT_CONST_ROUNDING);
-
- t_lo[0] = vshrn_n_s32(u_lo[0], DCT_CONST_BITS);
- t_hi[0] = vshrn_n_s32(u_hi[0], DCT_CONST_BITS);
- t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS);
- t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS);
- t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS);
- t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS);
- t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS);
- t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS);
- t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS);
- t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS);
- t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS);
- t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS);
- t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS);
- t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS);
- t_lo[7] = vshrn_n_s32(u_lo[7], DCT_CONST_BITS);
- t_hi[7] = vshrn_n_s32(u_hi[7], DCT_CONST_BITS);
+ t_lo[0] = vrshrn_n_s32(u_lo[0], DCT_CONST_BITS);
+ t_hi[0] = vrshrn_n_s32(u_hi[0], DCT_CONST_BITS);
+ t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS);
+ t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS);
+ t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+ t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+ t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS);
+ t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS);
+ t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS);
+ t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS);
+ t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+ t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+ t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS);
+ t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS);
+ t_lo[7] = vrshrn_n_s32(u_lo[7], DCT_CONST_BITS);
+ t_hi[7] = vrshrn_n_s32(u_hi[7], DCT_CONST_BITS);
in[0] = i[0];
in[2] = i[1];
@@ -820,7 +657,6 @@ static void fadst16_8col(int16x8_t *in) {
int16x4_t x_lo[16], x_hi[16];
int32x4_t s_lo[16], s_hi[16];
int32x4_t t_lo[16], t_hi[16];
- const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING);
x_lo[0] = vget_low_s16(in[15]);
x_hi[0] = vget_high_s16(in[15]);
@@ -857,185 +693,79 @@ static void fadst16_8col(int16x8_t *in) {
// stage 1
// s0 = cospi_1_64 * x0 + cospi_31_64 * x1;
- s_lo[0] = vaddq_s32(vmull_n_s16(x_lo[0], cospi_1_64),
- vmull_n_s16(x_lo[1], cospi_31_64));
- s_hi[0] = vaddq_s32(vmull_n_s16(x_hi[0], cospi_1_64),
- vmull_n_s16(x_hi[1], cospi_31_64));
// s1 = cospi_31_64 * x0 - cospi_1_64 * x1;
- s_lo[1] = vsubq_s32(vmull_n_s16(x_lo[0], cospi_31_64),
- vmull_n_s16(x_lo[1], cospi_1_64));
- s_hi[1] = vsubq_s32(vmull_n_s16(x_hi[0], cospi_31_64),
- vmull_n_s16(x_hi[1], cospi_1_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1],
+ cospi_1_64, cospi_31_64, &s_lo[0],
+ &s_hi[0], &s_lo[1], &s_hi[1]);
// s2 = cospi_5_64 * x2 + cospi_27_64 * x3;
- s_lo[2] = vaddq_s32(vmull_n_s16(x_lo[2], cospi_5_64),
- vmull_n_s16(x_lo[3], cospi_27_64));
- s_hi[2] = vaddq_s32(vmull_n_s16(x_hi[2], cospi_5_64),
- vmull_n_s16(x_hi[3], cospi_27_64));
// s3 = cospi_27_64 * x2 - cospi_5_64 * x3;
- s_lo[3] = vsubq_s32(vmull_n_s16(x_lo[2], cospi_27_64),
- vmull_n_s16(x_lo[3], cospi_5_64));
- s_hi[3] = vsubq_s32(vmull_n_s16(x_hi[2], cospi_27_64),
- vmull_n_s16(x_hi[3], cospi_5_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3],
+ cospi_5_64, cospi_27_64, &s_lo[2],
+ &s_hi[2], &s_lo[3], &s_hi[3]);
// s4 = cospi_9_64 * x4 + cospi_23_64 * x5;
- s_lo[4] = vaddq_s32(vmull_n_s16(x_lo[4], cospi_9_64),
- vmull_n_s16(x_lo[5], cospi_23_64));
- s_hi[4] = vaddq_s32(vmull_n_s16(x_hi[4], cospi_9_64),
- vmull_n_s16(x_hi[5], cospi_23_64));
// s5 = cospi_23_64 * x4 - cospi_9_64 * x5;
- s_lo[5] = vsubq_s32(vmull_n_s16(x_lo[4], cospi_23_64),
- vmull_n_s16(x_lo[5], cospi_9_64));
- s_hi[5] = vsubq_s32(vmull_n_s16(x_hi[4], cospi_23_64),
- vmull_n_s16(x_hi[5], cospi_9_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5],
+ cospi_9_64, cospi_23_64, &s_lo[4],
+ &s_hi[4], &s_lo[5], &s_hi[5]);
// s6 = cospi_13_64 * x6 + cospi_19_64 * x7;
- s_lo[6] = vaddq_s32(vmull_n_s16(x_lo[6], cospi_13_64),
- vmull_n_s16(x_lo[7], cospi_19_64));
- s_hi[6] = vaddq_s32(vmull_n_s16(x_hi[6], cospi_13_64),
- vmull_n_s16(x_hi[7], cospi_19_64));
// s7 = cospi_19_64 * x6 - cospi_13_64 * x7;
- s_lo[7] = vsubq_s32(vmull_n_s16(x_lo[6], cospi_19_64),
- vmull_n_s16(x_lo[7], cospi_13_64));
- s_hi[7] = vsubq_s32(vmull_n_s16(x_hi[6], cospi_19_64),
- vmull_n_s16(x_hi[7], cospi_13_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7],
+ cospi_13_64, cospi_19_64, &s_lo[6],
+ &s_hi[6], &s_lo[7], &s_hi[7]);
// s8 = cospi_17_64 * x8 + cospi_15_64 * x9;
- s_lo[8] = vaddq_s32(vmull_n_s16(x_lo[8], cospi_17_64),
- vmull_n_s16(x_lo[9], cospi_15_64));
- s_hi[8] = vaddq_s32(vmull_n_s16(x_hi[8], cospi_17_64),
- vmull_n_s16(x_hi[9], cospi_15_64));
// s9 = cospi_15_64 * x8 - cospi_17_64 * x9;
- s_lo[9] = vsubq_s32(vmull_n_s16(x_lo[8], cospi_15_64),
- vmull_n_s16(x_lo[9], cospi_17_64));
- s_hi[9] = vsubq_s32(vmull_n_s16(x_hi[8], cospi_15_64),
- vmull_n_s16(x_hi[9], cospi_17_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[8], x_hi[8], x_lo[9], x_hi[9],
+ cospi_17_64, cospi_15_64, &s_lo[8],
+ &s_hi[8], &s_lo[9], &s_hi[9]);
// s10 = cospi_21_64 * x10 + cospi_11_64 * x11;
- s_lo[10] = vaddq_s32(vmull_n_s16(x_lo[10], cospi_21_64),
- vmull_n_s16(x_lo[11], cospi_11_64));
- s_hi[10] = vaddq_s32(vmull_n_s16(x_hi[10], cospi_21_64),
- vmull_n_s16(x_hi[11], cospi_11_64));
// s11 = cospi_11_64 * x10 - cospi_21_64 * x11;
- s_lo[11] = vsubq_s32(vmull_n_s16(x_lo[10], cospi_11_64),
- vmull_n_s16(x_lo[11], cospi_21_64));
- s_hi[11] = vsubq_s32(vmull_n_s16(x_hi[10], cospi_11_64),
- vmull_n_s16(x_hi[11], cospi_21_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[10], x_hi[10], x_lo[11], x_hi[11],
+ cospi_21_64, cospi_11_64, &s_lo[10],
+ &s_hi[10], &s_lo[11], &s_hi[11]);
// s12 = cospi_25_64 * x12 + cospi_7_64 * x13;
- s_lo[12] = vaddq_s32(vmull_n_s16(x_lo[12], cospi_25_64),
- vmull_n_s16(x_lo[13], cospi_7_64));
- s_hi[12] = vaddq_s32(vmull_n_s16(x_hi[12], cospi_25_64),
- vmull_n_s16(x_hi[13], cospi_7_64));
// s13 = cospi_7_64 * x12 - cospi_25_64 * x13;
- s_lo[13] = vsubq_s32(vmull_n_s16(x_lo[12], cospi_7_64),
- vmull_n_s16(x_lo[13], cospi_25_64));
- s_hi[13] = vsubq_s32(vmull_n_s16(x_hi[12], cospi_7_64),
- vmull_n_s16(x_hi[13], cospi_25_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[12], x_hi[12], x_lo[13], x_hi[13],
+ cospi_25_64, cospi_7_64, &s_lo[12],
+ &s_hi[12], &s_lo[13], &s_hi[13]);
// s14 = cospi_29_64 * x14 + cospi_3_64 * x15;
- s_lo[14] = vaddq_s32(vmull_n_s16(x_lo[14], cospi_29_64),
- vmull_n_s16(x_lo[15], cospi_3_64));
- s_hi[14] = vaddq_s32(vmull_n_s16(x_hi[14], cospi_29_64),
- vmull_n_s16(x_hi[15], cospi_3_64));
// s15 = cospi_3_64 * x14 - cospi_29_64 * x15;
- s_lo[15] = vsubq_s32(vmull_n_s16(x_lo[14], cospi_3_64),
- vmull_n_s16(x_lo[15], cospi_29_64));
- s_hi[15] = vsubq_s32(vmull_n_s16(x_hi[14], cospi_3_64),
- vmull_n_s16(x_hi[15], cospi_29_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[14], x_hi[14], x_lo[15], x_hi[15],
+ cospi_29_64, cospi_3_64, &s_lo[14],
+ &s_hi[14], &s_lo[15], &s_hi[15]);
// fdct_round_shift
- t_lo[0] = vaddq_s32(s_lo[0], s_lo[8]);
- t_hi[0] = vaddq_s32(s_hi[0], s_hi[8]);
- t_lo[1] = vaddq_s32(s_lo[1], s_lo[9]);
- t_hi[1] = vaddq_s32(s_hi[1], s_hi[9]);
- t_lo[2] = vaddq_s32(s_lo[2], s_lo[10]);
- t_hi[2] = vaddq_s32(s_hi[2], s_hi[10]);
- t_lo[3] = vaddq_s32(s_lo[3], s_lo[11]);
- t_hi[3] = vaddq_s32(s_hi[3], s_hi[11]);
- t_lo[4] = vaddq_s32(s_lo[4], s_lo[12]);
- t_hi[4] = vaddq_s32(s_hi[4], s_hi[12]);
- t_lo[5] = vaddq_s32(s_lo[5], s_lo[13]);
- t_hi[5] = vaddq_s32(s_hi[5], s_hi[13]);
- t_lo[6] = vaddq_s32(s_lo[6], s_lo[14]);
- t_hi[6] = vaddq_s32(s_hi[6], s_hi[14]);
- t_lo[7] = vaddq_s32(s_lo[7], s_lo[15]);
- t_hi[7] = vaddq_s32(s_hi[7], s_hi[15]);
- t_lo[8] = vsubq_s32(s_lo[0], s_lo[8]);
- t_hi[8] = vsubq_s32(s_hi[0], s_hi[8]);
- t_lo[9] = vsubq_s32(s_lo[1], s_lo[9]);
- t_hi[9] = vsubq_s32(s_hi[1], s_hi[9]);
- t_lo[10] = vsubq_s32(s_lo[2], s_lo[10]);
- t_hi[10] = vsubq_s32(s_hi[2], s_hi[10]);
- t_lo[11] = vsubq_s32(s_lo[3], s_lo[11]);
- t_hi[11] = vsubq_s32(s_hi[3], s_hi[11]);
- t_lo[12] = vsubq_s32(s_lo[4], s_lo[12]);
- t_hi[12] = vsubq_s32(s_hi[4], s_hi[12]);
- t_lo[13] = vsubq_s32(s_lo[5], s_lo[13]);
- t_hi[13] = vsubq_s32(s_hi[5], s_hi[13]);
- t_lo[14] = vsubq_s32(s_lo[6], s_lo[14]);
- t_hi[14] = vsubq_s32(s_hi[6], s_hi[14]);
- t_lo[15] = vsubq_s32(s_lo[7], s_lo[15]);
- t_hi[15] = vsubq_s32(s_hi[7], s_hi[15]);
-
- t_lo[0] = vaddq_s32(t_lo[0], k__DCT_CONST_ROUNDING);
- t_hi[0] = vaddq_s32(t_hi[0], k__DCT_CONST_ROUNDING);
- t_lo[1] = vaddq_s32(t_lo[1], k__DCT_CONST_ROUNDING);
- t_hi[1] = vaddq_s32(t_hi[1], k__DCT_CONST_ROUNDING);
- t_lo[2] = vaddq_s32(t_lo[2], k__DCT_CONST_ROUNDING);
- t_hi[2] = vaddq_s32(t_hi[2], k__DCT_CONST_ROUNDING);
- t_lo[3] = vaddq_s32(t_lo[3], k__DCT_CONST_ROUNDING);
- t_hi[3] = vaddq_s32(t_hi[3], k__DCT_CONST_ROUNDING);
- t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING);
- t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING);
- t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING);
- t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING);
- t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING);
- t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING);
- t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING);
- t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING);
- t_lo[8] = vaddq_s32(t_lo[8], k__DCT_CONST_ROUNDING);
- t_hi[8] = vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING);
- t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING);
- t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING);
- t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING);
- t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING);
- t_lo[11] = vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING);
- t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING);
- t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING);
- t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING);
- t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING);
- t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING);
- t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING);
- t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING);
- t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING);
- t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING);
-
- t_lo[0] = vshrq_n_s32(t_lo[0], DCT_CONST_BITS);
- t_hi[0] = vshrq_n_s32(t_hi[0], DCT_CONST_BITS);
- t_lo[1] = vshrq_n_s32(t_lo[1], DCT_CONST_BITS);
- t_hi[1] = vshrq_n_s32(t_hi[1], DCT_CONST_BITS);
- t_lo[2] = vshrq_n_s32(t_lo[2], DCT_CONST_BITS);
- t_hi[2] = vshrq_n_s32(t_hi[2], DCT_CONST_BITS);
- t_lo[3] = vshrq_n_s32(t_lo[3], DCT_CONST_BITS);
- t_hi[3] = vshrq_n_s32(t_hi[3], DCT_CONST_BITS);
- t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS);
- t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS);
- t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS);
- t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS);
- t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS);
- t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS);
- t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS);
- t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS);
- t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS);
- t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS);
- t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS);
- t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS);
- t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS);
- t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS);
- t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS);
- t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS);
- t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS);
- t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS);
- t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS);
- t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS);
- t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS);
- t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS);
- t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS);
- t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+ t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS);
+ t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS);
+ t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS);
+ t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS);
+ t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS);
+ t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS);
+ t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS);
+ t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS);
+ t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS);
+ t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS);
+ t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS);
+ t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS);
+ t_lo[6] = vrshrq_n_s32(vaddq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS);
+ t_hi[6] = vrshrq_n_s32(vaddq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS);
+ t_lo[7] = vrshrq_n_s32(vaddq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS);
+ t_hi[7] = vrshrq_n_s32(vaddq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS);
+ t_lo[8] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS);
+ t_hi[8] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS);
+ t_lo[9] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS);
+ t_hi[9] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS);
+ t_lo[10] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS);
+ t_hi[10] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS);
+ t_lo[11] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS);
+ t_hi[11] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS);
+ t_lo[12] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS);
+ t_hi[12] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS);
+ t_lo[13] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS);
+ t_hi[13] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS);
+ t_lo[14] = vrshrq_n_s32(vsubq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS);
+ t_hi[14] = vrshrq_n_s32(vsubq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS);
+ t_lo[15] = vrshrq_n_s32(vsubq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS);
+ t_hi[15] = vrshrq_n_s32(vsubq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS);
// stage 2
s_lo[0] = t_lo[0];
@@ -1055,45 +785,25 @@ static void fadst16_8col(int16x8_t *in) {
s_lo[7] = t_lo[7];
s_hi[7] = t_hi[7];
// s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s_lo[8] = vaddq_s32(vmulq_n_s32(t_lo[8], cospi_4_64),
- vmulq_n_s32(t_lo[9], cospi_28_64));
- s_hi[8] = vaddq_s32(vmulq_n_s32(t_hi[8], cospi_4_64),
- vmulq_n_s32(t_hi[9], cospi_28_64));
// s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s_lo[9] = vsubq_s32(vmulq_n_s32(t_lo[8], cospi_28_64),
- vmulq_n_s32(t_lo[9], cospi_4_64));
- s_hi[9] = vsubq_s32(vmulq_n_s32(t_hi[8], cospi_28_64),
- vmulq_n_s32(t_hi[9], cospi_4_64));
+ butterfly_two_coeff_s32_noround(t_lo[8], t_hi[8], t_lo[9], t_hi[9],
+ cospi_4_64, cospi_28_64, &s_lo[8], &s_hi[8],
+ &s_lo[9], &s_hi[9]);
// s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s_lo[10] = vaddq_s32(vmulq_n_s32(t_lo[10], cospi_20_64),
- vmulq_n_s32(t_lo[11], cospi_12_64));
- s_hi[10] = vaddq_s32(vmulq_n_s32(t_hi[10], cospi_20_64),
- vmulq_n_s32(t_hi[11], cospi_12_64));
// s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s_lo[11] = vsubq_s32(vmulq_n_s32(t_lo[10], cospi_12_64),
- vmulq_n_s32(t_lo[11], cospi_20_64));
- s_hi[11] = vsubq_s32(vmulq_n_s32(t_hi[10], cospi_12_64),
- vmulq_n_s32(t_hi[11], cospi_20_64));
+ butterfly_two_coeff_s32_noround(t_lo[10], t_hi[10], t_lo[11], t_hi[11],
+ cospi_20_64, cospi_12_64, &s_lo[10],
+ &s_hi[10], &s_lo[11], &s_hi[11]);
// s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
- s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], -cospi_28_64),
- vmulq_n_s32(t_lo[13], cospi_4_64));
- s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], -cospi_28_64),
- vmulq_n_s32(t_hi[13], cospi_4_64));
// s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_4_64),
- vmulq_n_s32(t_lo[13], cospi_28_64));
- s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_4_64),
- vmulq_n_s32(t_hi[13], cospi_28_64));
+ butterfly_two_coeff_s32_noround(t_lo[13], t_hi[13], t_lo[12], t_hi[12],
+ cospi_28_64, cospi_4_64, &s_lo[13], &s_hi[13],
+ &s_lo[12], &s_hi[12]);
// s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
- s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_12_64),
- vmulq_n_s32(t_lo[15], cospi_20_64));
- s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_12_64),
- vmulq_n_s32(t_hi[15], cospi_20_64));
// s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
- s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_20_64),
- vmulq_n_s32(t_lo[15], cospi_12_64));
- s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_20_64),
- vmulq_n_s32(t_hi[15], cospi_12_64));
+ butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+ cospi_12_64, cospi_20_64, &s_lo[15],
+ &s_hi[15], &s_lo[14], &s_hi[14]);
// s0 + s4
t_lo[0] = vaddq_s32(s_lo[0], s_lo[4]);
@@ -1144,38 +854,22 @@ static void fadst16_8col(int16x8_t *in) {
t_lo[15] = vsubq_s32(s_lo[11], s_lo[15]);
t_hi[15] = vsubq_s32(s_hi[11], s_hi[15]);
- t_lo[8] = vaddq_s32(t_lo[8], k__DCT_CONST_ROUNDING);
- t_hi[8] = vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING);
- t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING);
- t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING);
- t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING);
- t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING);
- t_lo[11] = vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING);
- t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING);
- t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING);
- t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING);
- t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING);
- t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING);
- t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING);
- t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING);
- t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING);
- t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING);
- t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS);
- t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS);
- t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS);
- t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS);
- t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS);
- t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS);
- t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS);
- t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS);
- t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS);
- t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS);
- t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS);
- t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS);
- t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS);
- t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS);
- t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS);
- t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+ t_lo[8] = vrshrq_n_s32(t_lo[8], DCT_CONST_BITS);
+ t_hi[8] = vrshrq_n_s32(t_hi[8], DCT_CONST_BITS);
+ t_lo[9] = vrshrq_n_s32(t_lo[9], DCT_CONST_BITS);
+ t_hi[9] = vrshrq_n_s32(t_hi[9], DCT_CONST_BITS);
+ t_lo[10] = vrshrq_n_s32(t_lo[10], DCT_CONST_BITS);
+ t_hi[10] = vrshrq_n_s32(t_hi[10], DCT_CONST_BITS);
+ t_lo[11] = vrshrq_n_s32(t_lo[11], DCT_CONST_BITS);
+ t_hi[11] = vrshrq_n_s32(t_hi[11], DCT_CONST_BITS);
+ t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS);
+ t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS);
+ t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS);
+ t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS);
+ t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS);
+ t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS);
+ t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS);
+ t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS);
// stage 3
s_lo[0] = t_lo[0];
@@ -1187,25 +881,15 @@ static void fadst16_8col(int16x8_t *in) {
s_lo[3] = t_lo[3];
s_hi[3] = t_hi[3];
// s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
- s_lo[4] = vaddq_s32(vmulq_n_s32(t_lo[4], cospi_8_64),
- vmulq_n_s32(t_lo[5], cospi_24_64));
- s_hi[4] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_8_64),
- vmulq_n_s32(t_hi[5], cospi_24_64));
// s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s_lo[5] = vaddq_s32(vmulq_n_s32(t_lo[4], cospi_24_64),
- vmulq_n_s32(t_lo[5], -cospi_8_64));
- s_hi[5] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_24_64),
- vmulq_n_s32(t_hi[5], -cospi_8_64));
+ butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5],
+ cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4],
+ &s_lo[5], &s_hi[5]);
// s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
- s_lo[6] = vaddq_s32(vmulq_n_s32(t_lo[6], -cospi_24_64),
- vmulq_n_s32(t_lo[7], cospi_8_64));
- s_hi[6] = vaddq_s32(vmulq_n_s32(t_hi[6], -cospi_24_64),
- vmulq_n_s32(t_hi[7], cospi_8_64));
// s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
- s_lo[7] = vaddq_s32(vmulq_n_s32(t_lo[6], cospi_8_64),
- vmulq_n_s32(t_lo[7], cospi_24_64));
- s_hi[7] = vaddq_s32(vmulq_n_s32(t_hi[6], cospi_8_64),
- vmulq_n_s32(t_hi[7], cospi_24_64));
+ butterfly_two_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6],
+ cospi_24_64, cospi_8_64, &s_lo[7], &s_hi[7],
+ &s_lo[6], &s_hi[6]);
s_lo[8] = t_lo[8];
s_hi[8] = t_hi[8];
s_lo[9] = t_lo[9];
@@ -1215,25 +899,15 @@ static void fadst16_8col(int16x8_t *in) {
s_lo[11] = t_lo[11];
s_hi[11] = t_hi[11];
// s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
- s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_8_64),
- vmulq_n_s32(t_lo[13], cospi_24_64));
- s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_8_64),
- vmulq_n_s32(t_hi[13], cospi_24_64));
// s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_24_64),
- vmulq_n_s32(t_lo[13], -cospi_8_64));
- s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_24_64),
- vmulq_n_s32(t_hi[13], -cospi_8_64));
+ butterfly_two_coeff_s32_noround(t_lo[12], t_hi[12], t_lo[13], t_hi[13],
+ cospi_8_64, cospi_24_64, &s_lo[12], &s_hi[12],
+ &s_lo[13], &s_hi[13]);
// s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
- s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_24_64),
- vmulq_n_s32(t_lo[15], cospi_8_64));
- s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_24_64),
- vmulq_n_s32(t_hi[15], cospi_8_64));
// s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
- s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_8_64),
- vmulq_n_s32(t_lo[15], cospi_24_64));
- s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_8_64),
- vmulq_n_s32(t_hi[15], cospi_24_64));
+ butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+ cospi_24_64, cospi_8_64, &s_lo[15], &s_hi[15],
+ &s_lo[14], &s_hi[14]);
// s0 + s4
t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]);
@@ -1284,99 +958,62 @@ static void fadst16_8col(int16x8_t *in) {
t_lo[15] = vsubq_s32(s_lo[13], s_lo[15]);
t_hi[15] = vsubq_s32(s_hi[13], s_hi[15]);
- t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING);
- t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING);
- t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING);
- t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING);
- t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING);
- t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING);
- t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING);
- t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING);
- t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING);
- t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING);
- t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING);
- t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING);
- t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING);
- t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING);
- t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING);
- t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING);
- t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS);
- t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS);
- t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS);
- t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS);
- t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS);
- t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS);
- t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS);
- t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS);
- t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS);
- t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS);
- t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS);
- t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS);
- t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS);
- t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS);
- t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS);
- t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+ t_lo[4] = vrshrq_n_s32(t_lo[4], DCT_CONST_BITS);
+ t_hi[4] = vrshrq_n_s32(t_hi[4], DCT_CONST_BITS);
+ t_lo[5] = vrshrq_n_s32(t_lo[5], DCT_CONST_BITS);
+ t_hi[5] = vrshrq_n_s32(t_hi[5], DCT_CONST_BITS);
+ t_lo[6] = vrshrq_n_s32(t_lo[6], DCT_CONST_BITS);
+ t_hi[6] = vrshrq_n_s32(t_hi[6], DCT_CONST_BITS);
+ t_lo[7] = vrshrq_n_s32(t_lo[7], DCT_CONST_BITS);
+ t_hi[7] = vrshrq_n_s32(t_hi[7], DCT_CONST_BITS);
+ t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS);
+ t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS);
+ t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS);
+ t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS);
+ t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS);
+ t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS);
+ t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS);
+ t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS);
// stage 4
// s2 = (-cospi_16_64) * (x2 + x3);
- s_lo[2] = vmulq_n_s32(vaddq_s32(t_lo[2], t_lo[3]), -cospi_16_64);
- s_hi[2] = vmulq_n_s32(vaddq_s32(t_hi[2], t_hi[3]), -cospi_16_64);
// s3 = cospi_16_64 * (x2 - x3);
- s_lo[3] = vmulq_n_s32(vsubq_s32(t_lo[2], t_lo[3]), cospi_16_64);
- s_hi[3] = vmulq_n_s32(vsubq_s32(t_hi[2], t_hi[3]), cospi_16_64);
+ butterfly_one_coeff_s32_noround(t_lo[3], t_hi[3], t_lo[2], t_hi[2],
+ -cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3],
+ &s_hi[3]);
// s6 = cospi_16_64 * (x6 + x7);
- s_lo[6] = vmulq_n_s32(vaddq_s32(t_lo[6], t_lo[7]), cospi_16_64);
- s_hi[6] = vmulq_n_s32(vaddq_s32(t_hi[6], t_hi[7]), cospi_16_64);
// s7 = cospi_16_64 * (-x6 + x7);
- s_lo[7] = vmulq_n_s32(vsubq_s32(t_lo[7], t_lo[6]), cospi_16_64);
- s_hi[7] = vmulq_n_s32(vsubq_s32(t_hi[7], t_hi[6]), cospi_16_64);
+ butterfly_one_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6],
+ cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7],
+ &s_hi[7]);
// s10 = cospi_16_64 * (x10 + x11);
- s_lo[10] = vmulq_n_s32(vaddq_s32(t_lo[10], t_lo[11]), cospi_16_64);
- s_hi[10] = vmulq_n_s32(vaddq_s32(t_hi[10], t_hi[11]), cospi_16_64);
// s11 = cospi_16_64 * (-x10 + x11);
- s_lo[11] = vmulq_n_s32(vsubq_s32(t_lo[11], t_lo[10]), cospi_16_64);
- s_hi[11] = vmulq_n_s32(vsubq_s32(t_hi[11], t_hi[10]), cospi_16_64);
+ butterfly_one_coeff_s32_noround(t_lo[11], t_hi[11], t_lo[10], t_hi[10],
+ cospi_16_64, &s_lo[10], &s_hi[10], &s_lo[11],
+ &s_hi[11]);
// s14 = (-cospi_16_64) * (x14 + x15);
- s_lo[14] = vmulq_n_s32(vaddq_s32(t_lo[14], t_lo[15]), -cospi_16_64);
- s_hi[14] = vmulq_n_s32(vaddq_s32(t_hi[14], t_hi[15]), -cospi_16_64);
// s15 = cospi_16_64 * (x14 - x15);
- s_lo[15] = vmulq_n_s32(vsubq_s32(t_lo[14], t_lo[15]), cospi_16_64);
- s_hi[15] = vmulq_n_s32(vsubq_s32(t_hi[14], t_hi[15]), cospi_16_64);
+ butterfly_one_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+ -cospi_16_64, &s_lo[14], &s_hi[14], &s_lo[15],
+ &s_hi[15]);
// final fdct_round_shift
- t_lo[2] = vaddq_s32(s_lo[2], k__DCT_CONST_ROUNDING);
- t_hi[2] = vaddq_s32(s_hi[2], k__DCT_CONST_ROUNDING);
- t_lo[3] = vaddq_s32(s_lo[3], k__DCT_CONST_ROUNDING);
- t_hi[3] = vaddq_s32(s_hi[3], k__DCT_CONST_ROUNDING);
- t_lo[6] = vaddq_s32(s_lo[6], k__DCT_CONST_ROUNDING);
- t_hi[6] = vaddq_s32(s_hi[6], k__DCT_CONST_ROUNDING);
- t_lo[7] = vaddq_s32(s_lo[7], k__DCT_CONST_ROUNDING);
- t_hi[7] = vaddq_s32(s_hi[7], k__DCT_CONST_ROUNDING);
- t_lo[10] = vaddq_s32(s_lo[10], k__DCT_CONST_ROUNDING);
- t_hi[10] = vaddq_s32(s_hi[10], k__DCT_CONST_ROUNDING);
- t_lo[11] = vaddq_s32(s_lo[11], k__DCT_CONST_ROUNDING);
- t_hi[11] = vaddq_s32(s_hi[11], k__DCT_CONST_ROUNDING);
- t_lo[14] = vaddq_s32(s_lo[14], k__DCT_CONST_ROUNDING);
- t_hi[14] = vaddq_s32(s_hi[14], k__DCT_CONST_ROUNDING);
- t_lo[15] = vaddq_s32(s_lo[15], k__DCT_CONST_ROUNDING);
- t_hi[15] = vaddq_s32(s_hi[15], k__DCT_CONST_ROUNDING);
-
- x_lo[2] = vshrn_n_s32(t_lo[2], DCT_CONST_BITS);
- x_hi[2] = vshrn_n_s32(t_hi[2], DCT_CONST_BITS);
- x_lo[3] = vshrn_n_s32(t_lo[3], DCT_CONST_BITS);
- x_hi[3] = vshrn_n_s32(t_hi[3], DCT_CONST_BITS);
- x_lo[6] = vshrn_n_s32(t_lo[6], DCT_CONST_BITS);
- x_hi[6] = vshrn_n_s32(t_hi[6], DCT_CONST_BITS);
- x_lo[7] = vshrn_n_s32(t_lo[7], DCT_CONST_BITS);
- x_hi[7] = vshrn_n_s32(t_hi[7], DCT_CONST_BITS);
- x_lo[10] = vshrn_n_s32(t_lo[10], DCT_CONST_BITS);
- x_hi[10] = vshrn_n_s32(t_hi[10], DCT_CONST_BITS);
- x_lo[11] = vshrn_n_s32(t_lo[11], DCT_CONST_BITS);
- x_hi[11] = vshrn_n_s32(t_hi[11], DCT_CONST_BITS);
- x_lo[14] = vshrn_n_s32(t_lo[14], DCT_CONST_BITS);
- x_hi[14] = vshrn_n_s32(t_hi[14], DCT_CONST_BITS);
- x_lo[15] = vshrn_n_s32(t_lo[15], DCT_CONST_BITS);
- x_hi[15] = vshrn_n_s32(t_hi[15], DCT_CONST_BITS);
+ x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS);
+ x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS);
+ x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS);
+ x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS);
+ x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS);
+ x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS);
+ x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS);
+ x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS);
+ x_lo[10] = vrshrn_n_s32(s_lo[10], DCT_CONST_BITS);
+ x_hi[10] = vrshrn_n_s32(s_hi[10], DCT_CONST_BITS);
+ x_lo[11] = vrshrn_n_s32(s_lo[11], DCT_CONST_BITS);
+ x_hi[11] = vrshrn_n_s32(s_hi[11], DCT_CONST_BITS);
+ x_lo[14] = vrshrn_n_s32(s_lo[14], DCT_CONST_BITS);
+ x_hi[14] = vrshrn_n_s32(s_hi[14], DCT_CONST_BITS);
+ x_lo[15] = vrshrn_n_s32(s_lo[15], DCT_CONST_BITS);
+ x_hi[15] = vrshrn_n_s32(s_hi[15], DCT_CONST_BITS);
// x0, x1, x4, x5, x8, x9, x12, x13 narrow down to 16-bits directly
x_lo[0] = vmovn_s32(t_lo[0]);
@@ -1458,3 +1095,137 @@ void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride,
break;
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void highbd_load_buffer_4x4(const int16_t *input,
+ int32x4_t *in /*[4]*/, int stride) {
+ // { 0, 1, 1, 1 };
+ const int32x4_t nonzero_bias_a = vextq_s32(vdupq_n_s32(0), vdupq_n_s32(1), 3);
+ // { 1, 0, 0, 0 };
+ const int32x4_t nonzero_bias_b = vextq_s32(vdupq_n_s32(1), vdupq_n_s32(0), 3);
+ int32x4_t mask;
+
+ in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4);
+ in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4);
+ in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4);
+ in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4);
+
+ // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by
+ // one non-zero first elements
+ mask = vreinterpretq_s32_u32(vceqq_s32(in[0], nonzero_bias_a));
+ in[0] = vaddq_s32(in[0], mask);
+ in[0] = vaddq_s32(in[0], nonzero_bias_b);
+}
+
+static INLINE void highbd_write_buffer_4x4(tran_low_t *output, int32x4_t *res) {
+ const int32x4_t one = vdupq_n_s32(1);
+ res[0] = vshrq_n_s32(vaddq_s32(res[0], one), 2);
+ res[1] = vshrq_n_s32(vaddq_s32(res[1], one), 2);
+ res[2] = vshrq_n_s32(vaddq_s32(res[2], one), 2);
+ res[3] = vshrq_n_s32(vaddq_s32(res[3], one), 2);
+ vst1q_s32(output + 0 * 4, res[0]);
+ vst1q_s32(output + 1 * 4, res[1]);
+ vst1q_s32(output + 2 * 4, res[2]);
+ vst1q_s32(output + 3 * 4, res[3]);
+}
+
+static INLINE void highbd_fadst4x4_neon(int32x4_t *in /*[4]*/) {
+ int32x2_t s_lo[4], s_hi[4];
+ int64x2_t u_lo[4], u_hi[4], t_lo[4], t_hi[4];
+
+ s_lo[0] = vget_low_s32(in[0]);
+ s_hi[0] = vget_high_s32(in[0]);
+ s_lo[1] = vget_low_s32(in[1]);
+ s_hi[1] = vget_high_s32(in[1]);
+ s_lo[2] = vget_low_s32(in[2]);
+ s_hi[2] = vget_high_s32(in[2]);
+ s_lo[3] = vget_low_s32(in[3]);
+ s_hi[3] = vget_high_s32(in[3]);
+
+ // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9
+ t_lo[0] = vmull_n_s32(s_lo[0], sinpi_1_9);
+ t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[1], sinpi_2_9);
+ t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[3], sinpi_4_9);
+ t_hi[0] = vmull_n_s32(s_hi[0], sinpi_1_9);
+ t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[1], sinpi_2_9);
+ t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[3], sinpi_4_9);
+
+ // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9
+ t_lo[1] = vmull_n_s32(s_lo[0], sinpi_3_9);
+ t_lo[1] = vmlal_n_s32(t_lo[1], s_lo[1], sinpi_3_9);
+ t_lo[1] = vmlsl_n_s32(t_lo[1], s_lo[3], sinpi_3_9);
+ t_hi[1] = vmull_n_s32(s_hi[0], sinpi_3_9);
+ t_hi[1] = vmlal_n_s32(t_hi[1], s_hi[1], sinpi_3_9);
+ t_hi[1] = vmlsl_n_s32(t_hi[1], s_hi[3], sinpi_3_9);
+
+ // t2 = s0 * sinpi_4_9 - s1* sinpi_1_9 + s3 * sinpi_2_9
+ t_lo[2] = vmull_n_s32(s_lo[0], sinpi_4_9);
+ t_lo[2] = vmlsl_n_s32(t_lo[2], s_lo[1], sinpi_1_9);
+ t_lo[2] = vmlal_n_s32(t_lo[2], s_lo[3], sinpi_2_9);
+ t_hi[2] = vmull_n_s32(s_hi[0], sinpi_4_9);
+ t_hi[2] = vmlsl_n_s32(t_hi[2], s_hi[1], sinpi_1_9);
+ t_hi[2] = vmlal_n_s32(t_hi[2], s_hi[3], sinpi_2_9);
+
+ // t3 = s2 * sinpi_3_9
+ t_lo[3] = vmull_n_s32(s_lo[2], sinpi_3_9);
+ t_hi[3] = vmull_n_s32(s_hi[2], sinpi_3_9);
+
+ /*
+ * u0 = t0 + t3
+ * u1 = t1
+ * u2 = t2 - t3
+ * u3 = t2 - t0 + t3
+ */
+ u_lo[0] = vaddq_s64(t_lo[0], t_lo[3]);
+ u_hi[0] = vaddq_s64(t_hi[0], t_hi[3]);
+ u_lo[1] = t_lo[1];
+ u_hi[1] = t_hi[1];
+ u_lo[2] = vsubq_s64(t_lo[2], t_lo[3]);
+ u_hi[2] = vsubq_s64(t_hi[2], t_hi[3]);
+ u_lo[3] = vaddq_s64(vsubq_s64(t_lo[2], t_lo[0]), t_lo[3]);
+ u_hi[3] = vaddq_s64(vsubq_s64(t_hi[2], t_hi[0]), t_hi[3]);
+
+ // fdct_round_shift
+ in[0] = vcombine_s32(vrshrn_n_s64(u_lo[0], DCT_CONST_BITS),
+ vrshrn_n_s64(u_hi[0], DCT_CONST_BITS));
+ in[1] = vcombine_s32(vrshrn_n_s64(u_lo[1], DCT_CONST_BITS),
+ vrshrn_n_s64(u_hi[1], DCT_CONST_BITS));
+ in[2] = vcombine_s32(vrshrn_n_s64(u_lo[2], DCT_CONST_BITS),
+ vrshrn_n_s64(u_hi[2], DCT_CONST_BITS));
+ in[3] = vcombine_s32(vrshrn_n_s64(u_lo[3], DCT_CONST_BITS),
+ vrshrn_n_s64(u_hi[3], DCT_CONST_BITS));
+
+ transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]);
+}
+
+void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ int32x4_t in[4];
+ // int i;
+
+ switch (tx_type) {
+ case DCT_DCT: vpx_highbd_fdct4x4_neon(input, output, stride); break;
+ case ADST_DCT:
+ highbd_load_buffer_4x4(input, in, stride);
+ highbd_fadst4x4_neon(in);
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ highbd_write_buffer_4x4(output, in);
+ break;
+ case DCT_ADST:
+ highbd_load_buffer_4x4(input, in, stride);
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ highbd_fadst4x4_neon(in);
+ highbd_write_buffer_4x4(output, in);
+ break;
+ default:
+ assert(tx_type == ADST_ADST);
+ highbd_load_buffer_4x4(input, in, stride);
+ highbd_fadst4x4_neon(in);
+ highbd_fadst4x4_neon(in);
+ highbd_write_buffer_4x4(output, in);
+ break;
+ }
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
new file mode 100644
index 000000000..33753f77b
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __GNUC__
+#define LIKELY(v) __builtin_expect(v, 1)
+#define UNLIKELY(v) __builtin_expect(v, 0)
+#else
+#define LIKELY(v) (v)
+#define UNLIKELY(v) (v)
+#endif
+
+static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
+ int_mv result;
+ result.as_mv.row = row;
+ result.as_mv.col = col;
+ return result;
+}
+
+static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
+ // This is simplified from the C implementation to utilise that
+ // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and
+ // x->nmvjointsadcost[1] == x->nmvjointsadcost[3]
+ return mv.as_int == 0 ? 0 : 1;
+}
+
+static INLINE int mv_cost(const int_mv mv, const int *joint_cost,
+ int *const comp_cost[2]) {
+ assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX);
+ assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX);
+ return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] +
+ comp_cost[1][mv.as_mv.col];
+}
+
+static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
+ int sad_per_bit) {
+ const int_mv diff =
+ pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col);
+ return ROUND_POWER_OF_TWO(
+ (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
+ VP9_PROB_COST_SHIFT);
+}
+
+/*****************************************************************************
+ * This function utilizes 3 properties of the cost function lookup tables, *
+ * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in *
+ * vp9_encoder.c. *
+ * For the joint cost: *
+ * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] *
+ * For the component costs: *
+ * - For all i: mvsadcost[0][i] == mvsadcost[1][i] *
+ * (Equal costs for both components) *
+ * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] *
+ * (Cost function is even) *
+ * If these do not hold, then this function cannot be used without *
+ * modification, in which case you can revert to using the C implementation, *
+ * which does not rely on these properties. *
+ *****************************************************************************/
+int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
+ const search_site_config *cfg, MV *ref_mv,
+ MV *best_mv, int search_param, int sad_per_bit,
+ int *num00, const vp9_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
+ static const uint32_t data[4] = { 0, 1, 2, 3 };
+ const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data);
+
+ const int32x4_t zero_s32 = vdupq_n_s32(0);
+ const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max);
+ const int16x8_t v_max_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(maxmv.as_int));
+ const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min);
+ const int16x8_t v_min_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(minmv.as_int));
+
+ const int32x4_t v_spb_d = vdupq_n_s32(sad_per_bit);
+
+ const int32x4_t v_joint_cost_0_d = vdupq_n_s32(x->nmvjointsadcost[0]);
+ const int32x4_t v_joint_cost_1_d = vdupq_n_s32(x->nmvjointsadcost[1]);
+
+ // search_param determines the length of the initial step and hence the number
+ // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel
+ // 1 = (MAX_FIRST_STEP/2) pel,
+ // 2 = (MAX_FIRST_STEP/4) pel...
+ const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
+ const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
+ const int tot_steps = cfg->total_steps - search_param;
+
+ const int_mv fcenter_mv =
+ pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
+ const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int));
+
+ const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
+ const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);
+
+ int_mv bmv = pack_int_mv(ref_row, ref_col);
+ int_mv new_bmv = bmv;
+ int16x8_t v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
+
+ const int what_stride = x->plane[0].src.stride;
+ const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
+ const uint8_t *const what = x->plane[0].src.buf;
+ const uint8_t *const in_what =
+ x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
+
+ // Work out the start point for the search
+ const uint8_t *best_address = in_what;
+ const uint8_t *new_best_address = best_address;
+#if defined(__aarch64__)
+ int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address);
+#else
+ int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address);
+#endif
+ unsigned int best_sad = INT_MAX;
+ int i, j, step;
+
+ // Check the prerequisite cost function properties that are easy to check
+ // in an assert. See the function-level documentation for details on all
+ // prerequisites.
+ assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
+ assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
+
+ // Check the starting position
+ best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
+ best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);
+
+ *num00 = 0;
+
+ for (i = 0, step = 0; step < tot_steps; step++) {
+ for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
+ int16x8_t v_diff_mv_w;
+ int8x16_t v_inside_d;
+ uint32x4_t v_outside_d;
+ int32x4_t v_cost_d, v_sad_d;
+#if defined(__aarch64__)
+ int64x2_t v_blocka[2];
+#else
+ int32x4_t v_blocka[1];
+ uint32x2_t horiz_max_0, horiz_max_1;
+#endif
+
+ uint32_t horiz_max;
+ // Compute the candidate motion vectors
+ const int16x8_t v_ss_mv_w = vld1q_s16((const int16_t *)&ss_mv[i]);
+ const int16x8_t v_these_mv_w = vaddq_s16(v_bmv_w, v_ss_mv_w);
+ // Clamp them to the search bounds
+ int16x8_t v_these_mv_clamp_w = v_these_mv_w;
+ v_these_mv_clamp_w = vminq_s16(v_these_mv_clamp_w, v_max_mv_w);
+ v_these_mv_clamp_w = vmaxq_s16(v_these_mv_clamp_w, v_min_mv_w);
+ // The ones that did not change are inside the search area
+ v_inside_d = vreinterpretq_s8_u32(
+ vceqq_s32(vreinterpretq_s32_s16(v_these_mv_clamp_w),
+ vreinterpretq_s32_s16(v_these_mv_w)));
+
+ // If none of them are inside, then move on
+#if defined(__aarch64__)
+ horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d));
+#else
+ horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)),
+ vget_high_u32(vreinterpretq_u32_s8(v_inside_d)));
+ horiz_max_1 = vpmax_u32(horiz_max_0, horiz_max_0);
+ vst1_lane_u32(&horiz_max, horiz_max_1, 0);
+#endif
+ if (LIKELY(horiz_max == 0)) {
+ continue;
+ }
+
+ // The inverse mask indicates which of the MVs are outside
+ v_outside_d =
+ vreinterpretq_u32_s8(veorq_s8(v_inside_d, vdupq_n_s8((int8_t)0xff)));
+ // Shift right to keep the sign bit clear, we will use this later
+ // to set the cost to the maximum value.
+ v_outside_d = vshrq_n_u32(v_outside_d, 1);
+
+ // Compute the difference MV
+ v_diff_mv_w = vsubq_s16(v_these_mv_clamp_w, vfcmv);
+ // We utilise the fact that the cost function is even, and use the
+ // absolute difference. This allows us to use unsigned indexes later
+ // and reduces cache pressure somewhat as only a half of the table
+ // is ever referenced.
+ v_diff_mv_w = vabsq_s16(v_diff_mv_w);
+
+ // Compute the SIMD pointer offsets.
+ {
+#if defined(__aarch64__) // sizeof(intptr_t) == 8
+ // Load the offsets
+ int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]);
+ int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]);
+ // Set the ones falling outside to zero
+ v_bo10_q = vandq_s64(
+ v_bo10_q,
+ vmovl_s32(vget_low_s32(vreinterpretq_s32_s8(v_inside_d))));
+ v_bo32_q = vandq_s64(
+ v_bo32_q,
+ vmovl_s32(vget_high_s32(vreinterpretq_s32_s8(v_inside_d))));
+ // Compute the candidate addresses
+ v_blocka[0] = vaddq_s64(v_ba_q, v_bo10_q);
+ v_blocka[1] = vaddq_s64(v_ba_q, v_bo32_q);
+#else // sizeof(intptr_t) == 4
+ int32x4_t v_bo_d = vld1q_s32((const int32_t *)&ss_os[i]);
+ v_bo_d = vandq_s32(v_bo_d, vreinterpretq_s32_s8(v_inside_d));
+ v_blocka[0] = vaddq_s32(v_ba_d, v_bo_d);
+#endif
+ }
+
+ fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
+ in_what_stride, (uint32_t *)&v_sad_d);
+
+ // Look up the component cost of the residual motion vector
+ {
+ uint32_t cost[4];
+ int16_t __attribute__((aligned(16))) rowcol[8];
+ vst1q_s16(rowcol, v_diff_mv_w);
+
+ // Note: This is a use case for gather instruction
+ cost[0] = x->nmvsadcost[0][rowcol[0]] + x->nmvsadcost[0][rowcol[1]];
+ cost[1] = x->nmvsadcost[0][rowcol[2]] + x->nmvsadcost[0][rowcol[3]];
+ cost[2] = x->nmvsadcost[0][rowcol[4]] + x->nmvsadcost[0][rowcol[5]];
+ cost[3] = x->nmvsadcost[0][rowcol[6]] + x->nmvsadcost[0][rowcol[7]];
+
+ v_cost_d = vld1q_s32((int32_t *)cost);
+ }
+
+ // Now add in the joint cost
+ {
+ const uint32x4_t v_sel_d =
+ vceqq_s32(vreinterpretq_s32_s16(v_diff_mv_w), zero_s32);
+ const int32x4_t v_joint_cost_d = vreinterpretq_s32_u8(
+ vbslq_u8(vreinterpretq_u8_u32(v_sel_d),
+ vreinterpretq_u8_s32(v_joint_cost_0_d),
+ vreinterpretq_u8_s32(v_joint_cost_1_d)));
+ v_cost_d = vaddq_s32(v_cost_d, v_joint_cost_d);
+ }
+
+ // Multiply by sad_per_bit
+ v_cost_d = vmulq_s32(v_cost_d, v_spb_d);
+ // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT)
+ v_cost_d =
+ vaddq_s32(v_cost_d, vdupq_n_s32(1 << (VP9_PROB_COST_SHIFT - 1)));
+ v_cost_d = vshrq_n_s32(v_cost_d, VP9_PROB_COST_SHIFT);
+ // Add the cost to the sad
+ v_sad_d = vaddq_s32(v_sad_d, v_cost_d);
+
+ // Make the motion vectors outside the search area have max cost
+ // by or'ing in the comparison mask, this way the minimum search won't
+ // pick them.
+ v_sad_d = vorrq_s32(v_sad_d, vreinterpretq_s32_u32(v_outside_d));
+
+ // Find the minimum value and index horizontally in v_sad_d
+ {
+ uint32_t local_best_sad;
+#if defined(__aarch64__)
+ local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d));
+#else
+ uint32x2_t horiz_min_0 =
+ vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v_sad_d)),
+ vget_high_u32(vreinterpretq_u32_s32(v_sad_d)));
+ uint32x2_t horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0);
+ vst1_lane_u32(&local_best_sad, horiz_min_1, 0);
+#endif
+
+ // Update the global minimum if the local minimum is smaller
+ if (LIKELY(local_best_sad < best_sad)) {
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+ uint32_t local_best_idx;
+ const uint32x4_t v_sel_d =
+ vceqq_s32(v_sad_d, vdupq_n_s32(local_best_sad));
+ uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d);
+ v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff));
+
+#if defined(__aarch64__)
+ local_best_idx = vminvq_u32(v_mask_d);
+#else
+ horiz_min_0 =
+ vmin_u32(vget_low_u32(v_mask_d), vget_high_u32(v_mask_d));
+ horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0);
+ vst1_lane_u32(&local_best_idx, horiz_min_1, 0);
+#endif
+
+ new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+ new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
+
+ best_sad = local_best_sad;
+ }
+ }
+ }
+
+ bmv = new_bmv;
+ best_address = new_best_address;
+
+ v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
+#if defined(__aarch64__)
+ v_ba_q = vdupq_n_s64((intptr_t)best_address);
+#else
+ v_ba_d = vdupq_n_s32((intptr_t)best_address);
+#endif
+
+ if (UNLIKELY(best_address == in_what)) {
+ (*num00)++;
+ }
+ }
+
+ *best_mv = bmv.as_mv;
+ return best_sad;
+}
diff --git a/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
index e46f789ba..bc8dd4a34 100644
--- a/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
+++ b/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
@@ -14,6 +14,7 @@
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vp9/common/vp9_blockd.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_dsp/vpx_filter.h"
@@ -710,8 +711,8 @@ void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
const int src_h = src->y_crop_height;
const int dst_w = dst->y_crop_width;
const int dst_h = dst->y_crop_height;
- const int dst_uv_w = dst_w / 2;
- const int dst_uv_h = dst_h / 2;
+ const int dst_uv_w = dst->uv_crop_width;
+ const int dst_uv_h = dst->uv_crop_height;
int scaled = 0;
// phase_scaler is usually 0 or 8.
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 236c3176c..c2b55fcba 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -26,9 +26,8 @@
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/vpx_dsp_common.h"
-static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
- const int16x8_t dequant,
- tran_low_t *dqcoeff) {
+static VPX_FORCE_INLINE void calculate_dqcoeff_and_store(
+ const int16x8_t qcoeff, const int16x8_t dequant, tran_low_t *dqcoeff) {
const int32x4_t dqcoeff_0 =
vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
const int32x4_t dqcoeff_1 =
@@ -42,6 +41,82 @@ static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
#endif // CONFIG_VP9_HIGHBITDEPTH
}
+static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr,
+ int16x8_t v_eobmax,
+ uint16x8_t v_nz_mask) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan_ptr[0]);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan);
+ return vmaxq_s16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#ifdef __aarch64__
+ return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
+ const int16x4_t v_eobmax_3210 =
+ vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+ return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif // __aarch64__
+}
+
+static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ int16x8_t *round, int16x8_t *quant,
+ int16x8_t *dequant) {
+ *round = vld1q_s16(round_ptr);
+ *quant = vld1q_s16(quant_ptr);
+ *dequant = vld1q_s16(dequant_ptr);
+}
+
+static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round,
+ int16x8_t *v_quant,
+ int16x8_t *v_dequant) {
+#ifdef __aarch64__
+ *v_round = vdupq_laneq_s16(*v_round, 1);
+ *v_quant = vdupq_laneq_s16(*v_quant, 1);
+ *v_dequant = vdupq_laneq_s16(*v_dequant, 1);
+#else
+ *v_round = vdupq_lane_s16(vget_low_s16(*v_round), 1);
+ *v_quant = vdupq_lane_s16(vget_low_s16(*v_quant), 1);
+ *v_dequant = vdupq_lane_s16(vget_low_s16(*v_dequant), 1);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize_fp_8(
+ const int16x8_t *v_round, const int16x8_t *v_quant,
+ const int16x8_t *v_dequant, const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ int16x8_t *v_eobmax) {
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs = vabsq_s16(v_coeff);
+ const int16x8_t v_tmp = vqaddq_s16(v_abs, *v_round);
+ const int32x4_t v_tmp_lo =
+ vmull_s16(vget_low_s16(v_tmp), vget_low_s16(*v_quant));
+ const int32x4_t v_tmp_hi =
+ vmull_s16(vget_high_s16(v_tmp), vget_high_s16(*v_quant));
+ const int16x8_t v_tmp2 =
+ vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ calculate_dqcoeff_and_store(v_qcoeff, *v_dequant, dqcoeff_ptr);
+ store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+
+ *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
+}
+
void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
const int16_t *round_ptr, const int16_t *quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
@@ -50,136 +125,54 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
// Quantization pass: All coefficients with index >= zero_flag are
// skippable. Note: zero_flag can be zero.
int i;
- const int16x8_t v_zero = vdupq_n_s16(0);
- const int16x8_t v_one = vdupq_n_s16(1);
- int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
- int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
- int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
- int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
-
+ int16x8_t v_eobmax = vdupq_n_s16(-1);
+ int16x8_t v_round, v_quant, v_dequant;
(void)scan;
- // adjust for dc
- v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
- v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
- v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+ load_fp_values(round_ptr, quant_ptr, dequant_ptr, &v_round, &v_quant,
+ &v_dequant);
// process dc and the first seven ac coeffs
- {
- const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
- const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
- const int16x8_t v_abs = vabsq_s16(v_coeff);
- const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
- const int32x4_t v_tmp_lo =
- vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
- const int32x4_t v_tmp_hi =
- vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
- const int16x8_t v_tmp2 =
- vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
- const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
- const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
- const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
- const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
- const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
- calculate_dqcoeff_and_store(v_qcoeff, v_dequant, dqcoeff_ptr);
- v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
- store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
- v_round = vmovq_n_s16(round_ptr[1]);
- v_quant = vmovq_n_s16(quant_ptr[1]);
- v_dequant = vmovq_n_s16(dequant_ptr[1]);
- }
+ quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr,
+ dqcoeff_ptr, &v_eobmax);
+
// now process the rest of the ac coeffs
+ update_fp_values(&v_round, &v_quant, &v_dequant);
for (i = 8; i < count; i += 8) {
- const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
- const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i);
- const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
- const int16x8_t v_abs = vabsq_s16(v_coeff);
- const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
- const int32x4_t v_tmp_lo =
- vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
- const int32x4_t v_tmp_hi =
- vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
- const int16x8_t v_tmp2 =
- vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
- const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
- const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
- const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
- const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
- const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
- calculate_dqcoeff_and_store(v_qcoeff, v_dequant, dqcoeff_ptr + i);
- v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
- store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);
+ quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i,
+ qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax);
}
-#ifdef __aarch64__
- *eob_ptr = vmaxvq_s16(v_eobmax_76543210);
-#else
- {
- const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
- vget_high_s16(v_eobmax_76543210));
- const int64x1_t v_eobmax_xx32 =
- vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
- const int16x4_t v_eobmax_tmp =
- vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
- const int64x1_t v_eobmax_xxx3 =
- vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
- const int16x4_t v_eobmax_final =
- vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
-
- *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
- }
-#endif // __aarch64__
+
+ *eob_ptr = get_max_eob(v_eobmax);
}
static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
}
-void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
- tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
- const int16x8_t one = vdupq_n_s16(1);
- const int16x8_t neg_one = vdupq_n_s16(-1);
-
- // ROUND_POWER_OF_TWO(round_ptr[], 1)
- const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
- const int16x8_t quant = vld1q_s16(quant_ptr);
- const int16x4_t dequant = vld1_s16(dequant_ptr);
- // dequant >> 2 is used similar to zbin as a threshold.
- const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);
+static VPX_FORCE_INLINE void quantize_fp_32x32_8(
+ const int16x8_t *v_round, const int16x8_t *v_quant,
+ const int16x8_t *v_dequant, const int16x8_t *dequant_thresh,
+ const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int16x8_t *v_eobmax) {
+ const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_coeff_abs = vabsq_s16(v_coeff);
+ const int16x8_t v_thr_mask =
+ vreinterpretq_s16_u16(vcgeq_s16(v_coeff_abs, *dequant_thresh));
+ const int16x8_t v_tmp_rnd =
+ vandq_s16(vqaddq_s16(v_coeff_abs, *v_round), v_thr_mask);
+ const int16x8_t v_abs_qcoeff = vqdmulhq_s16(v_tmp_rnd, *v_quant);
+ const int16x8_t v_qcoeff =
+ vsubq_s16(veorq_s16(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+ const uint16x8_t v_nz_mask = vceqq_s16(v_abs_qcoeff, vdupq_n_s16(0));
- // Process dc and the first seven ac coeffs.
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
- const int16x8_t dequant_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh));
-
- int16x8_t qcoeff = vqaddq_s16(coeff_abs, round);
int32x4_t dqcoeff_0, dqcoeff_1;
- uint16x8_t eob_max;
- (void)scan;
- (void)count;
-
- // coeff * quant_ptr[]) >> 15
- qcoeff = vqdmulhq_s16(qcoeff, quant);
-
- // Restore sign.
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
- qcoeff = vandq_s16(qcoeff, dequant_mask);
-
- // qcoeff * dequant[] / 2
- dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), dequant);
- dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]);
-
+ dqcoeff_0 = vmull_s16(vget_low_s16(v_qcoeff), vget_low_s16(*v_dequant));
+ dqcoeff_1 = vmull_s16(vget_high_s16(v_qcoeff), vget_high_s16(*v_dequant));
// Add 1 if negative to round towards zero because the C uses division.
dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
#if CONFIG_VP9_HIGHBITDEPTH
vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1));
vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1));
@@ -188,76 +181,228 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
vshrn_n_s32(dqcoeff_1, 1)));
#endif
- eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+ store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+
+ *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
+}
+
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int16x8_t eob_max = vdupq_n_s16(-1);
+ // ROUND_POWER_OF_TWO(round_ptr[], 1)
+ int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
+ int16x8_t quant = vld1q_s16(quant_ptr);
+ int16x8_t dequant = vld1q_s16(dequant_ptr);
+ // dequant >> 2 is used similar to zbin as a threshold.
+ int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);
+ int i;
+
+ (void)scan;
+ (void)count;
+
+ // Process dc and the first seven ac coeffs.
+ quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr,
+ iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max);
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+ update_fp_values(&round, &quant, &dequant);
+ dequant_thresh = vdupq_lane_s16(vget_low_s16(dequant_thresh), 1);
iscan += 8;
coeff_ptr += 8;
qcoeff_ptr += 8;
dqcoeff_ptr += 8;
- {
- int i;
- const int16x8_t round = vrshrq_n_s16(vmovq_n_s16(round_ptr[1]), 1);
- const int16x8_t quant = vmovq_n_s16(quant_ptr[1]);
- const int16x8_t dequant_thresh =
- vshrq_n_s16(vmovq_n_s16(dequant_ptr[1]), 2);
-
- // Process the rest of the ac coeffs.
- for (i = 8; i < 32 * 32; i += 8) {
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
- const int16x8_t dequant_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh));
-
- int16x8_t qcoeff = vqaddq_s16(coeff_abs, round);
- int32x4_t dqcoeff_0, dqcoeff_1;
-
- qcoeff = vqdmulhq_s16(qcoeff, quant);
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
- qcoeff = vandq_s16(qcoeff, dequant_mask);
-
- dqcoeff_0 = vmull_n_s16(vget_low_s16(qcoeff), dequant_ptr[1]);
- dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]);
-
- dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
- dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+ // Process the rest of the ac coeffs.
+ for (i = 8; i < 32 * 32; i += 8) {
+ quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr,
+ iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max);
+
+ iscan += 8;
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+
+ *eob_ptr = get_max_eob(eob_max);
+}
#if CONFIG_VP9_HIGHBITDEPTH
- vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1));
- vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1));
-#else
- store_s16q_to_tran_low(
- dqcoeff_ptr,
- vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
-#endif
+static VPX_FORCE_INLINE uint16x4_t
+highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+ int32x4_t v_dequant_s32, int32x4_t v_round_s32) {
+ const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+ const int32x4_t v_coeff_sign =
+ vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+ const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+ const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32);
+ // const int abs_qcoeff = (int)((tmp * quant) >> 16);
+ const int32x4_t v_abs_qcoeff = vqdmulhq_s32(v_tmp, v_quant_s32);
+ // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_qcoeff =
+ vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+ const int32x4_t v_abs_dqcoeff = vmulq_s32(v_abs_qcoeff, v_dequant_s32);
+ // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_dqcoeff =
+ vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+ vst1q_s32(qcoeff_ptr, v_qcoeff);
+ vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+ // Packed nz_qcoeff_mask. Used to find eob.
+ return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0)));
+}
- eob_max =
- vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int16x4_t v_zero = vdup_n_s16(0);
+ const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+ const int16x4_t v_round = vld1_s16(round_ptr);
+ int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
+ int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
+ int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
+ uint16x4_t v_mask_lo, v_mask_hi;
+ int16x8_t v_eobmax = vdupq_n_s16(-1);
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+ (void)scan;
- iscan += 8;
- coeff_ptr += 8;
- qcoeff_ptr += 8;
- dqcoeff_ptr += 8;
- }
+ // DC and first 3 AC
+ v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+
+ // overwrite the DC constants with AC constants
+ v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+ v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+ v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+
+ // 4 more AC
+ v_mask_hi =
+ highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+
+ // Find the max lane eob for the first 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+ n_coeffs -= 8;
+ do {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+ v_mask_hi =
+ highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+ // Find the max lane eob for 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+ n_coeffs -= 8;
+ } while (n_coeffs);
+
+ *eob_ptr = get_max_eob(v_eobmax);
+}
-#ifdef __aarch64__
- *eob_ptr = vmaxvq_u16(eob_max);
-#else
- {
- const uint16x4_t eob_max_0 =
- vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
- const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
- const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
- vst1_lane_u16(eob_ptr, eob_max_2, 0);
- }
-#endif // __aarch64__
- }
+static VPX_FORCE_INLINE uint16x4_t
+highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+ int32x4_t v_dequant_s32, int32x4_t v_round_s32) {
+ const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+ const int32x4_t v_coeff_sign =
+ vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+ const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+ // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01])
+ const int32x4_t v_abs_coeff_scaled = vshlq_n_s32(v_abs_coeff, 2);
+ const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32);
+ // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32),
+ vreinterpretq_s32_u32(v_mask));
+ // const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale));
+ const int32x4_t v_abs_qcoeff =
+ vqdmulhq_s32(vshlq_n_s32(v_tmp, 1), v_quant_s32);
+ // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_qcoeff =
+ vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+ // vshlq_s32 will shift right if shift value is negative.
+ const int32x4_t v_abs_dqcoeff =
+ vshrq_n_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), 1);
+ // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_dqcoeff =
+ vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+ vst1q_s32(qcoeff_ptr, v_qcoeff);
+ vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+ // Packed nz_qcoeff_mask. Used to find eob.
+ return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0)));
}
+
+void vp9_highbd_quantize_fp_32x32_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
+ const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+ const int16x4_t v_zero = vdup_n_s16(0);
+ const int16x4_t v_round =
+ vqrdmulh_n_s16(vld1_s16(round_ptr), (int16_t)(1 << 14));
+ int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
+ int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
+ int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
+ uint16x4_t v_mask_lo, v_mask_hi;
+ int16x8_t v_eobmax = vdupq_n_s16(-1);
+
+ (void)scan;
+
+ // DC and first 3 AC
+ v_mask_lo =
+ highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+
+ // overwrite the DC constants with AC constants
+ v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+ v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+ v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+
+ // 4 more AC
+ v_mask_hi =
+ highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+
+ // Find the max lane eob for the first 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+ n_coeffs -= 8;
+ do {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_mask_lo =
+ highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+ v_mask_hi = highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4,
+ dqcoeff_ptr + 4, v_quant_s32,
+ v_dequant_s32, v_round_s32);
+ // Find the max lane eob for 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+ n_coeffs -= 8;
+ } while (n_coeffs);
+
+ *eob_ptr = get_max_eob(v_eobmax);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index e336179e9..28ab10a13 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -471,7 +471,7 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
cr->sb_index = i;
cr->reduce_refresh = 0;
if (cpi->oxcf.content != VP9E_CONTENT_SCREEN)
- if (count_sel<(3 * count_tot)>> 2) cr->reduce_refresh = 1;
+ if (count_sel < (3 * count_tot) >> 2) cr->reduce_refresh = 1;
}
// Set cyclic refresh parameters.
@@ -558,7 +558,7 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
cr->percent_refresh = 10;
cr->rate_ratio_qdelta = 1.5;
cr->rate_boost_fac = 10;
- if (cpi->refresh_golden_frame == 1) {
+ if (cpi->refresh_golden_frame == 1 && !cpi->use_svc) {
cr->percent_refresh = 0;
cr->rate_ratio_qdelta = 1.0;
}
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 75bd097f2..a84c8b524 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -134,9 +134,9 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp,
const TOKENEXTRA *p;
const vp9_extra_bit *const extra_bits =
#if CONFIG_VP9_HIGHBITDEPTH
- (bit_depth == VPX_BITS_12)
- ? vp9_extra_bits_high12
- : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 : vp9_extra_bits;
+ (bit_depth == VPX_BITS_12) ? vp9_extra_bits_high12
+ : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10
+ : vp9_extra_bits;
#else
vp9_extra_bits;
(void)bit_depth;
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 2885223b5..77d72396a 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -233,7 +233,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
frame == ALTREF_FRAME ||
(frame == GOLDEN_FRAME && use_gf_temporal_ref) ||
(frame != LAST_FRAME &&
- ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) ||
+ ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) ||
denoiser->denoising_level >= kDenHigh))) {
frame = LAST_FRAME;
ctx->newmv_sse = ctx->zeromv_lastref_sse;
@@ -764,8 +764,9 @@ int64_t vp9_scale_acskip_thresh(int64_t threshold,
VP9_DENOISER_LEVEL noise_level, int abs_sumdiff,
int temporal_layer_id) {
if (noise_level >= kDenLow && abs_sumdiff < 5)
- return threshold *=
- (noise_level == kDenLow) ? 2 : (temporal_layer_id == 2) ? 10 : 6;
+ return threshold *= (noise_level == kDenLow) ? 2
+ : (temporal_layer_id == 2) ? 10
+ : 6;
else
return threshold;
}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index a9f392bf5..1483ac069 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1299,7 +1299,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
// the reference (base layer frame) is key frame (i.e., is_key_frame == 1).
int is_key_frame =
(frame_is_intra_only(cm) ||
- (is_one_pass_cbr_svc(cpi) &&
+ (is_one_pass_svc(cpi) &&
cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
// Always use 4x4 partition for key frame.
const int use_4x4_partition = frame_is_intra_only(cm);
@@ -1406,7 +1406,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
assert(yv12 != NULL);
- if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) ||
+ if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) ||
cpi->svc.use_gf_temporal_ref_current_layer) {
// For now, GOLDEN will not be used for non-zero spatial layers, since
// it may not be a temporal reference.
@@ -3413,7 +3413,8 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
const VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MODE_INFO *const mi = xd->mi[0];
- const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_buffer(cpi, ref);
+ YV12_BUFFER_CONFIG *yv12;
+ YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref);
const int step_param = 1;
const MvLimits tmp_mv_limits = x->mv_limits;
const SEARCH_METHODS search_method = NSTEP;
@@ -3422,6 +3423,11 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
MV best_mv = { 0, 0 };
int cost_list[5];
+ if (scaled_ref_frame)
+ yv12 = scaled_ref_frame;
+ else
+ yv12 = get_ref_frame_buffer(cpi, ref);
+
assert(yv12 != NULL);
if (!yv12) return;
vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
@@ -5381,7 +5387,7 @@ static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile,
assert(yv12 != NULL);
- if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) ||
+ if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) ||
cpi->svc.use_gf_temporal_ref_current_layer) {
// For now, GOLDEN will not be used for non-zero spatial layers, since
// it may not be a temporal reference.
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index d3f4d1ea8..b66fdc0bc 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1333,7 +1333,7 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
// For 1 pass cbr: allocate scaled_frame that may be used as an intermediate
// buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a
// target of 1/4x1/4. number_spatial_layers must be greater than 2.
- if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc &&
+ if (is_one_pass_svc(cpi) && !cpi->svc.scaled_temp_is_alloc &&
cpi->svc.number_spatial_layers > 2) {
cpi->svc.scaled_temp_is_alloc = 1;
if (vpx_realloc_frame_buffer(
@@ -1511,7 +1511,7 @@ static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
// Temporal scalability.
cpi->svc.number_temporal_layers = oxcf->ts_number_layers;
- if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
+ if ((cpi->svc.number_temporal_layers > 1) ||
((cpi->svc.number_temporal_layers > 1 ||
cpi->svc.number_spatial_layers > 1) &&
cpi->oxcf.pass != 1)) {
@@ -1527,6 +1527,7 @@ static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
init_buffer_indices(cpi);
vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
+ cpi->fixed_qp_onepass = 0;
}
void vp9_check_reset_rc_flag(VP9_COMP *cpi) {
@@ -2077,7 +2078,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
rc->rc_2_frame = 0;
}
- if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
+ if ((cpi->svc.number_temporal_layers > 1) ||
((cpi->svc.number_temporal_layers > 1 ||
cpi->svc.number_spatial_layers > 1) &&
cpi->oxcf.pass != 1)) {
@@ -3263,7 +3264,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
vp9_denoiser_update_ref_frame(cpi);
#endif
- if (is_one_pass_cbr_svc(cpi)) vp9_svc_update_ref_frame(cpi);
+ if (is_one_pass_svc(cpi)) vp9_svc_update_ref_frame(cpi);
}
static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
@@ -3857,11 +3858,11 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
int q = 0, bottom_index = 0, top_index = 0;
int no_drop_scene_change = 0;
const INTERP_FILTER filter_scaler =
- (is_one_pass_cbr_svc(cpi))
+ (is_one_pass_svc(cpi))
? svc->downsample_filter_type[svc->spatial_layer_id]
: EIGHTTAP;
const int phase_scaler =
- (is_one_pass_cbr_svc(cpi))
+ (is_one_pass_svc(cpi))
? svc->downsample_filter_phase[svc->spatial_layer_id]
: 0;
@@ -3882,7 +3883,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
set_frame_size(cpi);
- if (is_one_pass_cbr_svc(cpi) &&
+ if (is_one_pass_svc(cpi) &&
cpi->un_scaled_source->y_width == cm->width << 2 &&
cpi->un_scaled_source->y_height == cm->height << 2 &&
svc->scaled_temp.y_width == cm->width << 1 &&
@@ -3896,7 +3897,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp,
filter_scaler, phase_scaler, filter_scaler2, phase_scaler2);
svc->scaled_one_half = 1;
- } else if (is_one_pass_cbr_svc(cpi) &&
+ } else if (is_one_pass_svc(cpi) &&
cpi->un_scaled_source->y_width == cm->width << 1 &&
cpi->un_scaled_source->y_height == cm->height << 1 &&
svc->scaled_one_half) {
@@ -3911,7 +3912,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
}
#ifdef OUTPUT_YUV_SVC_SRC
// Write out at most 3 spatial layers.
- if (is_one_pass_cbr_svc(cpi) && svc->spatial_layer_id < 3) {
+ if (is_one_pass_svc(cpi) && svc->spatial_layer_id < 3) {
vpx_write_yuv_frame(yuv_svc_src[svc->spatial_layer_id], cpi->Source);
}
#endif
@@ -4020,14 +4021,14 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
if (vp9_rc_drop_frame(cpi)) return 0;
}
- // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame
+ // For 1 pass SVC, only ZEROMV is allowed for spatial reference frame
// when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can
// avoid this frame-level upsampling (for non intra_only frames).
// For SVC single_layer mode, dynamic resize is allowed and we need to
// scale references for this case.
if (frame_is_intra_only(cm) == 0 &&
((svc->single_layer_svc && cpi->oxcf.resize_mode == RESIZE_DYNAMIC) ||
- !(is_one_pass_cbr_svc(cpi) && svc->force_zero_mode_spatial_ref))) {
+ !(is_one_pass_svc(cpi) && svc->force_zero_mode_spatial_ref))) {
vp9_scale_references(cpi);
}
@@ -4367,7 +4368,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
int frame_over_shoot_limit;
int frame_under_shoot_limit;
int q = 0, q_low = 0, q_high = 0;
- int last_q_attempt = 0;
int enable_acl;
#ifdef AGGRESSIVE_VBR
int qrange_adj = 1;
@@ -4381,8 +4381,18 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
// Maximal frame size allowed by the external rate control.
// case: 0, we ignore the max frame size limit, and encode with the qindex
// passed in by the external rate control model.
- // case: -1, we take VP9's decision for the max frame size.
+ // If the external qindex is VPX_DEFAULT_Q, libvpx will pick a qindex
+ // and may recode if undershoot/overshoot is seen.
+ // If the external qindex is not VPX_DEFAULT_Q, we force no recode.
+ // case: -1, we take libvpx's decision for the max frame size, as well as
+ // the recode decision.
+ // Otherwise: if a specific size is given, libvpx's recode decision
+ // will respect the given size.
int ext_rc_max_frame_size = 0;
+ // Use VP9's decision of qindex. This flag is in use only in external rate
+ // control model to help determine whether to recode when
+ // |ext_rc_max_frame_size| is 0.
+ int ext_rc_use_default_q = 1;
const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth;
#if CONFIG_RATE_CTRL
@@ -4491,7 +4501,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
}
}
#endif // CONFIG_RATE_CTRL
- if (cpi->ext_ratectrl.ready && !ext_rc_recode) {
+ if (cpi->ext_ratectrl.ready && !ext_rc_recode &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) {
vpx_codec_err_t codec_status;
const GF_GROUP *gf_group = &cpi->twopass.gf_group;
vpx_rc_encodeframe_decision_t encode_frame_decision;
@@ -4500,16 +4511,27 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
const RefCntBuffer *curr_frame_buf =
get_ref_cnt_buffer(cm, cm->new_fb_idx);
+ // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
+ // index 1 refers to the first encoding frame in a gf group.
+ // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref.
+ // See function define_gf_group_structure().
+ const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
get_ref_frame_bufs(cpi, ref_frame_bufs);
codec_status = vp9_extrc_get_encodeframe_decision(
&cpi->ext_ratectrl, curr_frame_buf->frame_index,
cm->current_frame_coding_index, gf_group->index, update_type,
- ref_frame_bufs, ref_frame_flags, &encode_frame_decision);
+ gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags,
+ &encode_frame_decision);
if (codec_status != VPX_CODEC_OK) {
vpx_internal_error(&cm->error, codec_status,
"vp9_extrc_get_encodeframe_decision() failed");
}
- q = encode_frame_decision.q_index;
+ // If the external model recommends a reserved value, we use
+ // libvpx's default q.
+ if (encode_frame_decision.q_index != VPX_DEFAULT_Q) {
+ q = encode_frame_decision.q_index;
+ ext_rc_use_default_q = 0;
+ }
ext_rc_max_frame_size = encode_frame_decision.max_frame_size;
}
@@ -4551,8 +4573,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
}
- if (cpi->ext_ratectrl.ready) {
- last_q_attempt = q;
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) {
// In general, for the external rate control, we take the qindex provided
// as input and encode the frame with this qindex faithfully. However,
// in some extreme scenarios, the provided qindex leads to a massive
@@ -4560,20 +4582,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
// to pick a new qindex and recode the frame. We return the new qindex
// through the API to the external model.
if (ext_rc_max_frame_size == 0) {
- break;
+ if (!ext_rc_use_default_q) break;
} else if (ext_rc_max_frame_size == -1) {
- if (rc->projected_frame_size < rc->max_frame_bandwidth) {
- break;
- }
+ // Do nothing, fall back to libvpx's recode decision.
} else {
- if (rc->projected_frame_size < ext_rc_max_frame_size) {
- break;
- }
+ // Change the max frame size, used in libvpx's recode decision.
+ rc->max_frame_bandwidth = ext_rc_max_frame_size;
}
- rc->max_frame_bandwidth = ext_rc_max_frame_size;
- // If the current frame size exceeds the ext_rc_max_frame_size,
- // we adjust the worst qindex to meet the frame size constraint.
- q_high = 255;
ext_rc_recode = 1;
}
#if CONFIG_RATE_CTRL
@@ -4776,23 +4791,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
rc->projected_frame_size < rc->max_frame_bandwidth)
loop = 0;
- // Special handling of external max frame size constraint
- if (ext_rc_recode) {
- // If the largest q is not able to meet the max frame size limit,
- // do nothing.
- if (rc->projected_frame_size > ext_rc_max_frame_size &&
- last_q_attempt == 255) {
- break;
- }
- // If VP9's q selection leads to a smaller q, we force it to use
- // a larger q to better approximate the external max frame size
- // constraint.
- if (rc->projected_frame_size > ext_rc_max_frame_size &&
- q <= last_q_attempt) {
- q = VPXMIN(255, last_q_attempt + 1);
- }
- }
-
if (loop) {
++loop_count;
++loop_at_this_size;
@@ -5518,6 +5516,32 @@ static void encode_frame_to_data_rate(
save_encode_params(cpi);
}
#endif
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0) {
+ vpx_codec_err_t codec_status;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+ const int ref_frame_flags = get_ref_frame_flags(cpi);
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
+ const RefCntBuffer *curr_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx);
+ // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
+ // index 1 refers to the first encoding frame in a gf group.
+ // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref.
+ // See function define_gf_group_structure().
+ const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
+ int ext_rdmult = VPX_DEFAULT_RDMULT;
+ get_ref_frame_bufs(cpi, ref_frame_bufs);
+ codec_status = vp9_extrc_get_frame_rdmult(
+ &cpi->ext_ratectrl, curr_frame_buf->frame_index,
+ cm->current_frame_coding_index, gf_group->index, update_type,
+ gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags,
+ &ext_rdmult);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_get_frame_rdmult() failed");
+ }
+ cpi->ext_ratectrl.ext_rdmult = ext_rdmult;
+ }
if (cpi->sf.recode_loop == DISALLOW_RECODE) {
if (!encode_without_recode_loop(cpi, size, dest)) return;
@@ -5593,7 +5617,7 @@ static void encode_frame_to_data_rate(
// build the bitstream
vp9_pack_bitstream(cpi, dest, size);
- {
+ if (cpi->ext_ratectrl.ready) {
const RefCntBuffer *coded_frame_buf =
get_ref_cnt_buffer(cm, cm->new_fb_idx);
vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result(
@@ -5800,16 +5824,6 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
unsigned int *frame_flags,
ENCODE_FRAME_RESULT *encode_frame_result) {
cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
-
- if (cpi->common.current_frame_coding_index == 0) {
- VP9_COMMON *cm = &cpi->common;
- const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats(
- &cpi->ext_ratectrl, &cpi->twopass.first_pass_info);
- if (codec_status != VPX_CODEC_OK) {
- vpx_internal_error(&cm->error, codec_status,
- "vp9_extrc_send_firstpass_stats() failed");
- }
- }
#if CONFIG_MISMATCH_DEBUG
mismatch_move_frame_idx_w();
#endif
@@ -7626,8 +7640,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
const int gf_group_index = cpi->twopass.gf_group.index;
int i;
- if (is_one_pass_cbr_svc(cpi)) {
- vp9_one_pass_cbr_svc_start_layer(cpi);
+ if (is_one_pass_svc(cpi)) {
+ vp9_one_pass_svc_start_layer(cpi);
}
vpx_usec_timer_start(&cmptimer);
@@ -7647,7 +7661,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
// Normal defaults
cm->reset_frame_context = 0;
cm->refresh_frame_context = 1;
- if (!is_one_pass_cbr_svc(cpi)) {
+ if (!is_one_pass_svc(cpi)) {
cpi->refresh_last_frame = 1;
cpi->refresh_golden_frame = 0;
cpi->refresh_alt_ref_frame = 0;
@@ -7780,7 +7794,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
adjust_frame_rate(cpi, source);
}
- if (is_one_pass_cbr_svc(cpi)) {
+ if (is_one_pass_svc(cpi)) {
vp9_update_temporal_layer_framerate(cpi);
vp9_restore_layer_context(cpi);
}
@@ -7914,12 +7928,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
// Save layer specific state.
- if (is_one_pass_cbr_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 ||
- cpi->svc.number_spatial_layers > 1) &&
- oxcf->pass == 2)) {
+ if (is_one_pass_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 ||
+ cpi->svc.number_spatial_layers > 1) &&
+ oxcf->pass == 2)) {
vp9_save_layer_context(cpi);
}
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+ cpi->fixed_qp_onepass = 0;
+
vpx_usec_timer_mark(&cmptimer);
cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
@@ -7928,7 +7945,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
#if CONFIG_INTERNAL_STATS
- if (oxcf->pass != 1) {
+ if (oxcf->pass != 1 && !cpi->last_frame_dropped) {
double samples = 0.0;
cpi->bytes += (int)(*size);
@@ -8090,7 +8107,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
#endif
- if (is_one_pass_cbr_svc(cpi)) {
+ if (is_one_pass_svc(cpi)) {
if (cm->show_frame) {
++cpi->svc.spatial_layer_to_encode;
if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers)
@@ -8159,9 +8176,11 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
unsigned int height) {
VP9_COMMON *cm = &cpi->common;
#if CONFIG_VP9_HIGHBITDEPTH
- update_initial_width(cpi, cm->use_highbitdepth, 1, 1);
+ update_initial_width(cpi, cm->use_highbitdepth, cpi->common.subsampling_x,
+ cpi->common.subsampling_y);
#else
- update_initial_width(cpi, 0, 1, 1);
+ update_initial_width(cpi, 0, cpi->common.subsampling_x,
+ cpi->common.subsampling_y);
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_VP9_TEMPORAL_DENOISING
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 1d5894525..cca8b53f8 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -971,6 +971,8 @@ typedef struct VP9_COMP {
RATE_QSTEP_MODEL rq_model[ENCODE_FRAME_TYPES];
#endif
EXT_RATECTRL ext_ratectrl;
+
+ int fixed_qp_onepass;
} VP9_COMP;
#if CONFIG_RATE_CTRL
@@ -1305,7 +1307,7 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(
void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
-static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) {
+static INLINE int is_one_pass_svc(const struct VP9_COMP *const cpi) {
return (cpi->use_svc && cpi->oxcf.pass == 0);
}
diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c
index 9f0098ab5..1d440442b 100644
--- a/vp9/encoder/vp9_ext_ratectrl.c
+++ b/vp9/encoder/vp9_ext_ratectrl.c
@@ -137,19 +137,21 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) {
vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
- FRAME_UPDATE_TYPE update_type,
+ FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
vpx_rc_encodeframe_decision_t *encode_frame_decision) {
if (ext_ratectrl == NULL) {
return VPX_CODEC_INVALID_PARAM;
}
- if (ext_ratectrl->ready) {
+ if (ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0) {
vpx_rc_status_t rc_status;
vpx_rc_encodeframe_info_t encode_frame_info;
encode_frame_info.show_index = show_index;
encode_frame_info.coding_index = coding_index;
encode_frame_info.gop_index = gop_index;
encode_frame_info.frame_type = extrc_get_frame_type(update_type);
+ encode_frame_info.gop_size = gop_size;
+ encode_frame_info.use_alt_ref = use_alt_ref;
vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
encode_frame_info.ref_frame_coding_indexes,
@@ -198,3 +200,62 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result(
}
return VPX_CODEC_OK;
}
+
+vpx_codec_err_t vp9_extrc_get_gop_decision(
+ EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
+ vpx_rc_gop_decision_t *gop_decision) {
+ vpx_rc_status_t rc_status;
+ if (ext_ratectrl == NULL || !ext_ratectrl->ready ||
+ (ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model,
+ gop_info, gop_decision);
+ if (gop_decision->use_alt_ref) {
+ const int arf_constraint =
+ gop_decision->gop_coding_frames >= gop_info->min_gf_interval &&
+ gop_decision->gop_coding_frames < gop_info->lag_in_frames;
+ if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR;
+ }
+ // TODO(chengchen): Take min and max gf interval from the model
+ // and overwrite libvpx's decision so that we can get rid
+ // of one of the checks here.
+ if (gop_decision->gop_coding_frames > gop_info->frames_to_key ||
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref >
+ gop_info->max_gf_interval) {
+ return VPX_CODEC_ERROR;
+ }
+ if (rc_status == VPX_RC_ERROR) {
+ return VPX_CODEC_ERROR;
+ }
+ return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_get_frame_rdmult(
+ EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
+ FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+ int *rdmult) {
+ vpx_rc_status_t rc_status;
+ vpx_rc_encodeframe_info_t encode_frame_info;
+ if (ext_ratectrl == NULL || !ext_ratectrl->ready ||
+ (ext_ratectrl->funcs.rc_type & VPX_RC_RDMULT) == 0) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ encode_frame_info.show_index = show_index;
+ encode_frame_info.coding_index = coding_index;
+ encode_frame_info.gop_index = gop_index;
+ encode_frame_info.frame_type = extrc_get_frame_type(update_type);
+ encode_frame_info.gop_size = gop_size;
+ encode_frame_info.use_alt_ref = use_alt_ref;
+
+ vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
+ encode_frame_info.ref_frame_coding_indexes,
+ encode_frame_info.ref_frame_valid_list);
+ rc_status = ext_ratectrl->funcs.get_frame_rdmult(ext_ratectrl->model,
+ &encode_frame_info, rdmult);
+ if (rc_status == VPX_RC_ERROR) {
+ return VPX_CODEC_ERROR;
+ }
+ return VPX_CODEC_OK;
+}
diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h
index 74fd68b96..7c3875883 100644
--- a/vp9/encoder/vp9_ext_ratectrl.h
+++ b/vp9/encoder/vp9_ext_ratectrl.h
@@ -16,6 +16,7 @@
typedef struct EXT_RATECTRL {
int ready;
+ int ext_rdmult;
vpx_rc_model_t model;
vpx_rc_funcs_t funcs;
vpx_rc_config_t ratectrl_config;
@@ -35,7 +36,7 @@ vpx_codec_err_t vp9_extrc_send_firstpass_stats(
vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
- FRAME_UPDATE_TYPE update_type,
+ FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
vpx_rc_encodeframe_decision_t *encode_frame_decision);
@@ -45,4 +46,14 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result(
const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth,
uint32_t input_bit_depth, const int actual_encoding_qindex);
+vpx_codec_err_t vp9_extrc_get_gop_decision(
+ EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
+ vpx_rc_gop_decision_t *gop_decision);
+
+vpx_codec_err_t vp9_extrc_get_frame_rdmult(
+ EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
+ FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+ int *rdmult);
+
#endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 67302ed03..e9250e25c 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -2113,11 +2113,10 @@ static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi,
}
// Clamp odd edge cases.
- total_group_bits = (total_group_bits < 0)
- ? 0
- : (total_group_bits > twopass->kf_group_bits)
- ? twopass->kf_group_bits
- : total_group_bits;
+ total_group_bits = (total_group_bits < 0) ? 0
+ : (total_group_bits > twopass->kf_group_bits)
+ ? twopass->kf_group_bits
+ : total_group_bits;
// Clip based on user supplied data rate variability limit.
if (total_group_bits > (int64_t)max_bits * gop_frames)
@@ -2714,6 +2713,9 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
// frame in which case it will already have been done.
if (is_key_frame == 0) {
vp9_zero(twopass->gf_group);
+ ++rc->gop_global_index;
+ } else {
+ rc->gop_global_index = 0;
}
vpx_clear_system_state();
@@ -2751,6 +2753,37 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
}
}
#endif
+ // If the external rate control model for GOP is used, the gop decisions
+ // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref|
+ // will be overwritten.
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0) {
+ vpx_codec_err_t codec_status;
+ vpx_rc_gop_decision_t gop_decision;
+ vpx_rc_gop_info_t gop_info;
+ gop_info.min_gf_interval = rc->min_gf_interval;
+ gop_info.max_gf_interval = rc->max_gf_interval;
+ gop_info.active_min_gf_interval = active_gf_interval.min;
+ gop_info.active_max_gf_interval = active_gf_interval.max;
+ gop_info.allow_alt_ref = allow_alt_ref;
+ gop_info.is_key_frame = is_key_frame;
+ gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active;
+ gop_info.frames_since_key = rc->frames_since_key;
+ gop_info.frames_to_key = rc->frames_to_key;
+ gop_info.lag_in_frames = cpi->oxcf.lag_in_frames;
+ gop_info.show_index = cm->current_video_frame;
+ gop_info.coding_index = cm->current_frame_coding_index;
+ gop_info.gop_global_index = rc->gop_global_index;
+
+ codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info,
+ &gop_decision);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_get_gop_decision() failed");
+ }
+ gop_coding_frames = gop_decision.gop_coding_frames;
+ use_alt_ref = gop_decision.use_alt_ref;
+ }
// Was the group length constrained by the requirement for a new KF?
rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 1 : 0;
@@ -3461,6 +3494,16 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
FIRSTPASS_STATS this_frame;
const int show_idx = cm->current_video_frame;
+ if (cpi->common.current_frame_coding_index == 0) {
+ VP9_COMMON *cm = &cpi->common;
+ const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats(
+ &cpi->ext_ratectrl, &cpi->twopass.first_pass_info);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_send_firstpass_stats() failed");
+ }
+ }
+
if (!twopass->stats_in) return;
// Configure image size specific vizier parameters
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 697c589ab..579b466ca 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -268,6 +268,7 @@ static void block_variance(const uint8_t *src, int src_stride,
#endif
uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) {
int i, j, k = 0;
+ uint32_t k_sqr = 0;
*sse = 0;
*sum = 0;
@@ -305,7 +306,8 @@ static void block_variance(const uint8_t *src, int src_stride,
#endif
*sse += sse8x8[k];
*sum += sum8x8[k];
- var8x8[k] = sse8x8[k] - (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6);
+ k_sqr = (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6);
+ var8x8[k] = sse8x8[k] > k_sqr ? sse8x8[k] - k_sqr : k_sqr - sse8x8[k];
k++;
}
}
@@ -319,6 +321,7 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
int i, j, k = 0;
+ uint32_t k_sqr = 0;
for (i = 0; i < nh; i += 2) {
for (j = 0; j < nw; j += 2) {
@@ -326,9 +329,10 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
- var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
- (b_width_log2_lookup[unit_size] +
- b_height_log2_lookup[unit_size] + 6));
+ k_sqr = (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
+ (b_width_log2_lookup[unit_size] +
+ b_height_log2_lookup[unit_size] + 6));
+ var_o[k] = sse_o[k] > k_sqr ? sse_o[k] - k_sqr : k_sqr - sse_o[k];
k++;
}
}
@@ -452,6 +456,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
unsigned int var8x8[64] = { 0 };
TX_SIZE tx_size;
int i, k;
+ uint32_t sum_sqr;
#if CONFIG_VP9_HIGHBITDEPTH
const vpx_bit_depth_t bd = cpi->common.bit_depth;
#endif
@@ -463,7 +468,8 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
cpi->common.use_highbitdepth, bd,
#endif
sse8x8, sum8x8, var8x8);
- var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
+ sum_sqr = (uint32_t)((int64_t)sum * sum) >> (bw + bh + 4);
+ var = sse > sum_sqr ? sse - sum_sqr : sum_sqr - sse;
*var_y = var;
*sse_y = sse;
@@ -1112,7 +1118,7 @@ static INLINE int rd_less_than_thresh_row_mt(int64_t best_rd, int thresh,
}
static INLINE void update_thresh_freq_fact_row_mt(
- VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance,
+ VP9_COMP *cpi, TileDataEnc *tile_data, unsigned int source_variance,
int thresh_freq_fact_idx, MV_REFERENCE_FRAME ref_frame,
THR_MODES best_mode_idx, PREDICTION_MODE mode) {
THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
@@ -1627,9 +1633,9 @@ static int search_new_mv(VP9_COMP *cpi, MACROBLOCK *x,
return -1;
// Exit NEWMV search if base_mv_sse is large.
- if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale))
+ if (sf->base_mv_aggressive && (base_mv_sse >> scale) > best_sse_sofar)
return -1;
- if (base_mv_sse < (best_sse_sofar << 1)) {
+ if ((base_mv_sse >> 1) < best_sse_sofar) {
// Base layer mv is good.
// Exit NEWMV search if the base_mv is (0, 0) and sse is low, since
// (0, 0) mode is already tested.
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 9058997b0..dcc44449f 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -149,34 +149,6 @@ void vp9_highbd_quantize_fp_32x32_c(
}
#endif
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
- const int16_t *scan, const int16_t *iscan) {
- MACROBLOCKD *const xd = &x->e_mbd;
- struct macroblock_plane *p = &x->plane[plane];
- struct macroblockd_plane *pd = &xd->plane[plane];
- tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block),
- *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- const int n_coeffs = 4 * 4;
-
- if (x->skip_block) {
- memset(qcoeff, 0, n_coeffs * sizeof(*qcoeff));
- memset(dqcoeff, 0, n_coeffs * sizeof(*dqcoeff));
- return;
- }
-
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin,
- p->round, p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, &p->eobs[block], scan, iscan);
- return;
- }
-#endif
- vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant,
- &p->eobs[block], scan, iscan);
-}
-
static void invert_quant(int16_t *quant, int16_t *shift, int d) {
unsigned t;
int l, m;
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 2e6d7da2b..f626f0656 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -37,9 +37,6 @@ typedef struct {
DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
} QUANTS;
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
- const int16_t *scan, const int16_t *iscan);
-
struct VP9_COMP;
struct VP9Common;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 085297391..d9207f7a2 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -327,7 +327,7 @@ static void update_buffer_level_postencode(VP9_COMP *cpi,
rc->buffer_level = rc->bits_off_target;
- if (is_one_pass_cbr_svc(cpi)) {
+ if (is_one_pass_svc(cpi)) {
update_layer_buffer_level_postencode(&cpi->svc, encoded_frame_size);
}
}
@@ -910,7 +910,7 @@ static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) {
active_worst_quality =
curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] << 1;
} else {
- if (!rc->is_src_frame_alt_ref &&
+ if (!rc->is_src_frame_alt_ref && !cpi->use_svc &&
(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
active_worst_quality =
curr_frame == 1
@@ -1871,7 +1871,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
}
}
} else {
- if ((cpi->use_svc && oxcf->rc_mode == VPX_CBR) ||
+ if ((cpi->use_svc) ||
(!rc->is_src_frame_alt_ref &&
!(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
rc->last_q[INTER_FRAME] = qindex;
@@ -2021,6 +2021,11 @@ int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) {
(rc->baseline_gf_interval + af_ratio - 1)
: ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
(rc->baseline_gf_interval + af_ratio - 1);
+ // For SVC: refresh flags are used to define the pattern, so we can't
+ // use that for boosting the target size here.
+ // TODO(marpan): Consider adding internal boost on TL0 for VBR-SVC.
+ // For now just use the CBR logic for setting target size.
+ if (cpi->use_svc) target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
if (target > INT_MAX) target = INT_MAX;
return vp9_rc_clamp_pframe_target_size(cpi, (int)target);
}
@@ -2147,7 +2152,7 @@ int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
} else {
target = rc->avg_frame_bandwidth;
}
- if (is_one_pass_cbr_svc(cpi)) {
+ if (is_one_pass_svc(cpi)) {
// Note that for layers, avg_frame_bandwidth is the cumulative
// per-frame-bandwidth. For the target size of this frame, use the
// layer average frame size (i.e., non-cumulative per-frame-bw).
@@ -2282,7 +2287,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
(svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0)) {
cm->frame_type = KEY_FRAME;
rc->source_alt_ref_active = 0;
- if (is_one_pass_cbr_svc(cpi)) {
+ if (is_one_pass_svc(cpi)) {
if (cm->current_video_frame > 0) vp9_svc_reset_temporal_layers(cpi, 1);
layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
svc->number_temporal_layers);
@@ -2290,11 +2295,14 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
// Assumption here is that LAST_FRAME is being updated for a keyframe.
// Thus no change in update flags.
- target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ if (cpi->oxcf.rc_mode == VPX_CBR)
+ target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ else
+ target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
}
} else {
cm->frame_type = INTER_FRAME;
- if (is_one_pass_cbr_svc(cpi)) {
+ if (is_one_pass_svc(cpi)) {
LAYER_CONTEXT *lc = &svc->layer_context[layer];
// Add condition current_video_frame > 0 for the case where first frame
// is intra only followed by overlay/copy frame. In this case we don't
@@ -2303,7 +2311,23 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
(svc->spatial_layer_id == 0 && cm->current_video_frame > 0)
? 0
: svc->layer_context[svc->temporal_layer_id].is_key_frame;
- target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
+ if (cpi->oxcf.rc_mode == VPX_CBR) {
+ target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
+ } else {
+ double rate_err = 0.0;
+ rc->fac_active_worst_inter = 140;
+ rc->fac_active_worst_gf = 100;
+ if (rc->rolling_target_bits > 0) {
+ rate_err =
+ (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits;
+ if (rate_err < 1.0)
+ rc->fac_active_worst_inter = 120;
+ else if (rate_err > 2.0)
+ // Increase active_worst faster if rate fluctuation is high.
+ rc->fac_active_worst_inter = 160;
+ }
+ target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
+ }
}
}
@@ -2312,7 +2336,10 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
svc->layer_context[layer].is_key_frame == 1) {
cm->frame_type = KEY_FRAME;
cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
- target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ if (cpi->oxcf.rc_mode == VPX_CBR)
+ target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ else
+ target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
}
// Set the buffer idx and refresh flags for key frames in simulcast mode.
// Note the buffer slot for long-term reference is set below (line 2255),
@@ -2397,7 +2424,10 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
}
if (svc->set_intra_only_frame) {
set_intra_only_frame(cpi);
- target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ if (cpi->oxcf.rc_mode == VPX_CBR)
+ target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ else
+ target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
}
// Overlay frame predicts from LAST (intra-only)
if (svc->previous_frame_is_intra_only) cpi->ref_frame_flags |= VP9_LAST_FLAG;
@@ -2584,13 +2614,12 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
const uint32_t pic_breadth =
VPXMAX(cpi->common.width, cpi->common.height);
int i;
- for (i = LEVEL_1; i < LEVEL_MAX; ++i) {
+ for (i = 0; i < VP9_LEVELS; ++i) {
if (vp9_level_defs[i].max_luma_picture_size >= pic_size &&
vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
if (rc->min_gf_interval <=
(int)vp9_level_defs[i].min_altref_distance) {
- rc->min_gf_interval =
- (int)vp9_level_defs[i].min_altref_distance + 1;
+ rc->min_gf_interval = (int)vp9_level_defs[i].min_altref_distance;
rc->max_gf_interval =
VPXMAX(rc->max_gf_interval, rc->min_gf_interval);
}
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 83a12cde7..96a8fd3f1 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -211,6 +211,10 @@ typedef struct {
// Flag to constrain golden frame interval on key frame frequency for 1 pass
// VBR.
int constrain_gf_key_freq_onepass_vbr;
+
+ // The index of the current GOP. Start from zero.
+ // When a key frame is inserted, it resets to zero.
+ int gop_global_index;
} RATE_CONTROL;
struct VP9_COMP;
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 9fa3ff186..58dd75b44 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -244,6 +244,12 @@ int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) {
// largest dc_quant is 21387, therefore rdmult should fit in int32_t
int rdmult = q * q;
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 &&
+ cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) {
+ return cpi->ext_ratectrl.ext_rdmult;
+ }
+
// Make sure this function is floating point safe.
vpx_clear_system_state();
@@ -287,6 +293,11 @@ static int modulate_rdmult(const VP9_COMP *cpi, int rdmult) {
int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
int rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex);
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 &&
+ cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) {
+ return cpi->ext_ratectrl.ext_rdmult;
+ }
return modulate_rdmult(cpi, rdmult);
}
@@ -567,6 +578,12 @@ void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],
}
}
+// Disable gcc 12.2 false positive warning.
+// warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=]
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif
void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
const struct macroblockd_plane *pd,
ENTROPY_CONTEXT t_above[16],
@@ -604,6 +621,9 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
break;
}
}
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 0171a0572..a464ce38f 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1108,6 +1108,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
xd->mi[0]->tx_size = TX_4X4;
+ assert(!x->skip_block);
+
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
@@ -1135,7 +1137,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
int16_t *const src_diff =
vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
- tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
xd->mi[0]->bmi[block].as_mode = mode;
vp9_predict_intra_block(xd, 1, TX_4X4, mode,
x->skip_encode ? src : dst,
@@ -1148,7 +1153,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
const int coeff_ctx =
combine_entropy_contexts(tempa[idx], templ[idy]);
vp9_highbd_fwht4x4(src_diff, coeff, 8);
- vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+ vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant,
+ eob, so->scan, so->iscan);
ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
so->neighbors, cpi->sf.use_fast_coef_costing);
tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0);
@@ -1166,7 +1173,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
vpx_highbd_fdct4x4(src_diff, coeff, 8);
else
vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
- vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+ vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant,
+ eob, so->scan, so->iscan);
ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
so->neighbors, cpi->sf.use_fast_coef_costing);
distortion += vp9_highbd_block_error_dispatch(
@@ -1236,7 +1245,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
int16_t *const src_diff =
vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
- tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
xd->mi[0]->bmi[block].as_mode = mode;
vp9_predict_intra_block(xd, 1, TX_4X4, mode, x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride, dst,
@@ -1248,7 +1260,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
const int coeff_ctx =
combine_entropy_contexts(tempa[idx], templ[idy]);
vp9_fwht4x4(src_diff, coeff, 8);
- vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+ vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+ so->scan, so->iscan);
ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
so->neighbors, cpi->sf.use_fast_coef_costing);
tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
@@ -1263,7 +1277,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
const int coeff_ctx =
combine_entropy_contexts(tempa[idx], templ[idy]);
vp9_fht4x4(src_diff, coeff, 8, tx_type);
- vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+ vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+ so->scan, so->iscan);
ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
so->neighbors, cpi->sf.use_fast_coef_costing);
tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
@@ -1640,6 +1656,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x,
const int is_compound = has_second_ref(mi);
const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
+ assert(!x->skip_block);
+
for (ref = 0; ref < 1 + is_compound; ++ref) {
const int bw = b_width_log2_lookup[BLOCK_8X8];
const int h = 4 * (i >> bw);
@@ -1701,18 +1719,27 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x,
const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
#endif
int64_t ssz, rd, rd1, rd2;
- tran_low_t *coeff;
+ tran_low_t *coeff, *qcoeff, *dqcoeff;
+ uint16_t *eob;
int coeff_ctx;
k += (idy * 2 + idx);
coeff_ctx = combine_entropy_contexts(ta[k & 1], tl[k >> 1]);
coeff = BLOCK_OFFSET(p->coeff, k);
+ qcoeff = BLOCK_OFFSET(p->qcoeff, k);
+ dqcoeff = BLOCK_OFFSET(pd->dqcoeff, k);
+ eob = &p->eobs[k];
+
x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
coeff, 8);
- vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
#if CONFIG_VP9_HIGHBITDEPTH
+ vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+ so->scan, so->iscan);
thisdistortion += vp9_highbd_block_error_dispatch(
coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd);
#else
+ vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant, eob, so->scan, so->iscan);
thisdistortion +=
vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz);
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -3242,6 +3269,7 @@ int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) {
// For two pass account for any formatting bars detected.
if (cpi->oxcf.pass == 2) {
TWO_PASS *twopass = &cpi->twopass;
+ vpx_clear_system_state();
// The inactive region is specified in MBs not mi units.
// The image edge is in the following MB row.
@@ -3269,6 +3297,7 @@ int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) {
// For two pass account for any formatting bars detected.
if (cpi->oxcf.pass == 2) {
TWO_PASS *twopass = &cpi->twopass;
+ vpx_clear_system_state();
// The inactive region is specified in MBs not mi units.
// The image edge is in the following MB row.
@@ -3470,7 +3499,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
}
mode_skip_mask[INTRA_FRAME] |=
- ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+ (uint16_t) ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0;
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index a163297e6..d75488a8e 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -39,7 +39,7 @@ void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data,
}
void vp9_disable_segfeature(struct segmentation *seg, int segment_id,
SEG_LVL_FEATURES feature_id) {
- seg->feature_mask[segment_id] &= ~(1 << feature_id);
+ seg->feature_mask[segment_id] &= ~(1u << feature_id);
}
void vp9_clear_segdata(struct segmentation *seg, int segment_id,
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index a57a70ab1..7e9435fb5 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -290,7 +290,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
}
static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) {
- if (is_one_pass_cbr_svc(cpi))
+ if (is_one_pass_svc(cpi))
return &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
cpi->svc.number_temporal_layers +
cpi->svc.temporal_layer_id];
@@ -354,7 +354,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) {
cpi->alt_ref_source = lc->alt_ref_source;
// Check if it is one_pass_cbr_svc mode and lc->speed > 0 (real-time mode
// does not use speed = 0).
- if (is_one_pass_cbr_svc(cpi) && lc->speed > 0) {
+ if (is_one_pass_svc(cpi) && lc->speed > 0) {
cpi->oxcf.speed = lc->speed;
}
cpi->loopfilter_ctrl = lc->loopfilter_ctrl;
@@ -754,7 +754,7 @@ void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) {
svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG);
}
-int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
+int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) {
int width = 0, height = 0;
SVC *const svc = &cpi->svc;
LAYER_CONTEXT *lc = NULL;
@@ -894,6 +894,10 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
RATE_CONTROL *const lrc = &lc->rc;
lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q);
lrc->best_quality = vp9_quantizer_to_qindex(lc->min_q);
+ if (cpi->fixed_qp_onepass) {
+ lrc->worst_quality = cpi->rc.worst_quality;
+ lrc->best_quality = cpi->rc.best_quality;
+ }
}
if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 &&
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index b2d1d1b98..c7328cf57 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -255,7 +255,7 @@ int vp9_denoise_svc_non_key(struct VP9_COMP *const cpi);
void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi);
-int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi);
+int vp9_one_pass_svc_start_layer(struct VP9_COMP *const cpi);
void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 701bb8928..8af30c42a 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -777,16 +777,16 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
// Assign higher weight to matching MB if it's error
// score is lower. If not applying MC default behavior
// is to weight all MBs equal.
- blk_fw[0] = err < (thresh_low << THR_SHIFT)
- ? 2
- : err < (thresh_high << THR_SHIFT) ? 1 : 0;
+ blk_fw[0] = err < (thresh_low << THR_SHIFT) ? 2
+ : err < (thresh_high << THR_SHIFT) ? 1
+ : 0;
blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0];
} else {
use_32x32 = 0;
for (k = 0; k < 4; k++)
- blk_fw[k] = blk_bestsme[k] < thresh_low
- ? 2
- : blk_bestsme[k] < thresh_high ? 1 : 0;
+ blk_fw[k] = blk_bestsme[k] < thresh_low ? 2
+ : blk_bestsme[k] < thresh_high ? 1
+ : 0;
}
for (k = 0; k < 4; k++) {
diff --git a/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/vp9/encoder/x86/highbd_temporal_filter_sse4.c
index 4fa24512c..a7f5117cf 100644
--- a/vp9/encoder/x86/highbd_temporal_filter_sse4.c
+++ b/vp9/encoder/x86/highbd_temporal_filter_sse4.c
@@ -191,13 +191,11 @@ static INLINE void highbd_read_chroma_dist_row_8(
}
static void vp9_highbd_apply_temporal_filter_luma_8(
- const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
- int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
- int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
- int uv_pre_stride, unsigned int block_width, unsigned int block_height,
- int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
- uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist,
- const uint32_t *v_dist, const uint32_t *const *neighbors_first,
+ const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+ const uint32_t *const *neighbors_first,
const uint32_t *const *neighbors_second, int top_weight,
int bottom_weight) {
const int rounding = (1 << strength) >> 1;
@@ -256,17 +254,12 @@ static void vp9_highbd_apply_temporal_filter_luma_8(
highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
y_accum);
- y_src += y_src_stride;
y_pre += y_pre_stride;
y_count += y_pre_stride;
y_accum += y_pre_stride;
y_dist += DIST_STRIDE;
- u_src += uv_src_stride;
- u_pre += uv_pre_stride;
u_dist += DIST_STRIDE;
- v_src += uv_src_stride;
- v_pre += uv_pre_stride;
v_dist += DIST_STRIDE;
// Then all the rows except the last one
@@ -300,11 +293,7 @@ static void vp9_highbd_apply_temporal_filter_luma_8(
highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
&v_first, &v_second);
- u_src += uv_src_stride;
- u_pre += uv_pre_stride;
u_dist += DIST_STRIDE;
- v_src += uv_src_stride;
- v_pre += uv_pre_stride;
v_dist += DIST_STRIDE;
}
@@ -320,7 +309,6 @@ static void vp9_highbd_apply_temporal_filter_luma_8(
highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
y_accum);
- y_src += y_src_stride;
y_pre += y_pre_stride;
y_count += y_pre_stride;
y_accum += y_pre_stride;
@@ -364,13 +352,10 @@ static void vp9_highbd_apply_temporal_filter_luma_8(
// Perform temporal filter for the luma component.
static void vp9_highbd_apply_temporal_filter_luma(
- const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
- int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
- int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
- int uv_pre_stride, unsigned int block_width, unsigned int block_height,
- int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
- uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist,
- const uint32_t *u_dist, const uint32_t *v_dist) {
+ const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
unsigned int blk_col = 0, uv_blk_col = 0;
const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
const unsigned int mid_width = block_width >> 1,
@@ -384,9 +369,7 @@ static void vp9_highbd_apply_temporal_filter_luma(
neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
vp9_highbd_apply_temporal_filter_luma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
- v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
neighbors_first, neighbors_second, top_weight, bottom_weight);
@@ -399,13 +382,10 @@ static void vp9_highbd_apply_temporal_filter_luma(
for (; blk_col < mid_width;
blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
vp9_highbd_apply_temporal_filter_luma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
- u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
- block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
- y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
- v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
- bottom_weight);
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
}
if (!use_whole_blk) {
@@ -417,21 +397,16 @@ static void vp9_highbd_apply_temporal_filter_luma(
for (; blk_col < last_width;
blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
vp9_highbd_apply_temporal_filter_luma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
- u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
- block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
- y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
- v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
- bottom_weight);
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
}
// Right
neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
vp9_highbd_apply_temporal_filter_luma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
- v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
neighbors_first, neighbors_second, top_weight, bottom_weight);
@@ -491,13 +466,11 @@ static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
// else use top_weight for top half, and bottom weight for bottom half.
static void vp9_highbd_apply_temporal_filter_chroma_8(
- const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
- int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
- int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
- int uv_pre_stride, unsigned int uv_block_width,
- unsigned int uv_block_height, int ss_x, int ss_y, int strength,
- uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
- const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+ const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+ unsigned int uv_block_width, unsigned int uv_block_height, int ss_x,
+ int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count,
+ uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist,
+ const uint32_t *u_dist, const uint32_t *v_dist,
const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
int top_weight, int bottom_weight, const int *blk_fw) {
const int rounding = (1 << strength) >> 1;
@@ -565,10 +538,8 @@ static void vp9_highbd_apply_temporal_filter_chroma_8(
highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
v_accum);
- u_src += uv_src_stride;
u_pre += uv_pre_stride;
u_dist += DIST_STRIDE;
- v_src += uv_src_stride;
v_pre += uv_pre_stride;
v_dist += DIST_STRIDE;
u_count += uv_pre_stride;
@@ -576,8 +547,6 @@ static void vp9_highbd_apply_temporal_filter_chroma_8(
v_count += uv_pre_stride;
v_accum += uv_pre_stride;
- y_src += y_src_stride * (1 + ss_y);
- y_pre += y_pre_stride * (1 + ss_y);
y_dist += DIST_STRIDE * (1 + ss_y);
// Then all the rows except the last one
@@ -649,10 +618,8 @@ static void vp9_highbd_apply_temporal_filter_chroma_8(
highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
v_accum);
- u_src += uv_src_stride;
u_pre += uv_pre_stride;
u_dist += DIST_STRIDE;
- v_src += uv_src_stride;
v_pre += uv_pre_stride;
v_dist += DIST_STRIDE;
u_count += uv_pre_stride;
@@ -660,8 +627,6 @@ static void vp9_highbd_apply_temporal_filter_chroma_8(
v_count += uv_pre_stride;
v_accum += uv_pre_stride;
- y_src += y_src_stride * (1 + ss_y);
- y_pre += y_pre_stride * (1 + ss_y);
y_dist += DIST_STRIDE * (1 + ss_y);
}
@@ -720,12 +685,10 @@ static void vp9_highbd_apply_temporal_filter_chroma_8(
// Perform temporal filter for the chroma components.
static void vp9_highbd_apply_temporal_filter_chroma(
- const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
- int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
- int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
- int uv_pre_stride, unsigned int block_width, unsigned int block_height,
- int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
- uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+ unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+ int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+ uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
const unsigned int uv_width = block_width >> ss_x,
uv_height = block_height >> ss_y;
@@ -755,8 +718,6 @@ static void vp9_highbd_apply_temporal_filter_chroma(
if (use_whole_blk) {
vp9_highbd_apply_temporal_filter_chroma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
@@ -764,8 +725,6 @@ static void vp9_highbd_apply_temporal_filter_chroma(
neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
} else {
vp9_highbd_apply_temporal_filter_chroma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
@@ -789,13 +748,11 @@ static void vp9_highbd_apply_temporal_filter_chroma(
}
vp9_highbd_apply_temporal_filter_chroma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
- v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
- strength, u_accum + uv_blk_col, u_count + uv_blk_col,
- v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
- u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
- top_weight, bottom_weight, NULL);
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+ neighbors_snd, top_weight, bottom_weight, NULL);
blk_col += blk_col_step;
uv_blk_col += uv_blk_col_step;
@@ -812,8 +769,6 @@ static void vp9_highbd_apply_temporal_filter_chroma(
for (; uv_blk_col < uv_mid_width;
blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
vp9_highbd_apply_temporal_filter_chroma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
@@ -830,8 +785,6 @@ static void vp9_highbd_apply_temporal_filter_chroma(
for (; uv_blk_col < uv_last_width;
blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
vp9_highbd_apply_temporal_filter_chroma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
@@ -849,13 +802,11 @@ static void vp9_highbd_apply_temporal_filter_chroma(
}
vp9_highbd_apply_temporal_filter_chroma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
- v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
- strength, u_accum + uv_blk_col, u_count + uv_blk_col,
- v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
- u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
- top_weight, bottom_weight, NULL);
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+ neighbors_snd, top_weight, bottom_weight, NULL);
}
void vp9_highbd_apply_temporal_filter_sse4_1(
@@ -929,14 +880,12 @@ void vp9_highbd_apply_temporal_filter_sse4_1(
u_dist_ptr = u_dist + 1;
v_dist_ptr = v_dist + 1;
- vp9_highbd_apply_temporal_filter_luma(
- y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
- u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
- strength, blk_fw, use_whole_blk, y_accum, y_count, y_dist_ptr, u_dist_ptr,
- v_dist_ptr);
+ vp9_highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width,
+ block_height, ss_x, ss_y, strength,
+ blk_fw, use_whole_blk, y_accum, y_count,
+ y_dist_ptr, u_dist_ptr, v_dist_ptr);
vp9_highbd_apply_temporal_filter_chroma(
- y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count,
y_dist_ptr, u_dist_ptr, v_dist_ptr);
diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
index 2188903b1..e9943447f 100644
--- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -111,7 +111,7 @@ static void fadst4_sse2(__m128i *in) {
const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
- const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kZero = _mm_setzero_si128();
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8];
__m128i in7 = _mm_add_epi16(in[0], in[1]);
@@ -424,7 +424,7 @@ static void fadst8_sse2(__m128i *in) {
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__const_0 = _mm_set1_epi16(0);
+ const __m128i k__const_0 = _mm_setzero_si128();
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
@@ -1056,7 +1056,7 @@ static void fadst16_8col(__m128i *in) {
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kZero = _mm_setzero_si128();
u[0] = _mm_unpacklo_epi16(in[15], in[0]);
u[1] = _mm_unpackhi_epi16(in[15], in[0]);
diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
index fcf50eb2a..0e04a2f41 100644
--- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
+++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
@@ -76,9 +76,9 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
int *num00, const vp9_variance_fn_ptr_t *fn_ptr,
const MV *center_mv) {
const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max);
- const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
+ const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int);
const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min);
- const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);
+ const __m128i v_min_mv_w = _mm_set1_epi32((int)minmv.as_int);
const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);
@@ -96,14 +96,14 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
const int_mv fcenter_mv =
pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
- const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);
+ const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int);
const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);
int_mv bmv = pack_int_mv(ref_row, ref_col);
int_mv new_bmv = bmv;
- __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);
+ __m128i v_bmv_w = _mm_set1_epi32((int)bmv.as_int);
const int what_stride = x->plane[0].src.stride;
const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
@@ -300,7 +300,7 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
bmv = new_bmv;
best_address = new_best_address;
- v_bmv_w = _mm_set1_epi32(bmv.as_int);
+ v_bmv_w = _mm_set1_epi32((int)bmv.as_int);
#if VPX_ARCH_X86_64
v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index 7685e7bc3..bf0e8b121 100644
--- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -754,8 +754,8 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
const int src_h = src->y_crop_height;
const int dst_w = dst->y_crop_width;
const int dst_h = dst->y_crop_height;
- const int dst_uv_w = dst_w / 2;
- const int dst_uv_h = dst_h / 2;
+ const int dst_uv_w = dst->uv_crop_width;
+ const int dst_uv_h = dst->uv_crop_height;
int scaled = 0;
// phase_scaler is usually 0 or 8.
diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c
index db18b1a7a..da285be8e 100644
--- a/vp9/encoder/x86/vp9_quantize_avx2.c
+++ b/vp9/encoder/x86/vp9_quantize_avx2.c
@@ -18,7 +18,7 @@
#include "vpx_dsp/x86/quantize_sse2.h"
// Zero fill 8 positions in the output buffer.
-static INLINE void store_zero_tran_low(tran_low_t *a) {
+static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) {
const __m256i zero = _mm256_setzero_si256();
#if CONFIG_VP9_HIGHBITDEPTH
_mm256_storeu_si256((__m256i *)(a), zero);
@@ -28,22 +28,73 @@ static INLINE void store_zero_tran_low(tran_low_t *a) {
#endif
}
-static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr,
- __m256i *coeff256) {
- const __m256i iscan = _mm256_loadu_si256(iscan_ptr);
- const __m256i zero256 = _mm256_setzero_si256();
+static VPX_FORCE_INLINE void load_fp_values_avx2(
+ const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr,
+ __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) {
+ *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ *round = _mm256_permute4x64_epi64(*round, 0x54);
+ *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+ *dequant =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+ *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+ __m256i v_eobmax,
+ __m256i v_mask) {
#if CONFIG_VP9_HIGHBITDEPTH
- // The _mm256_packs_epi32() in load_tran_low() packs the 64 bit coeff as
- // B1 A1 B0 A0. Shuffle to B1 B0 A1 A0 in order to scan eob correctly.
- const __m256i _coeff256 = _mm256_permute4x64_epi64(*coeff256, 0xd8);
- const __m256i zero_coeff0 = _mm256_cmpeq_epi16(_coeff256, zero256);
+ const __m256i v_iscan = _mm256_permute4x64_epi64(
+ _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
#else
- const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256);
+ const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
#endif
- const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256);
- // Add one to convert from indices to counts
- const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0);
- return _mm256_and_si256(iscan_plus_one, nzero_coeff0);
+ const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask);
+ return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob256) {
+ const __m256i eob_lo = eob256;
+ // Copy upper 128 to lower 128
+ const __m256i eob_hi = _mm256_permute2x128_si256(eob256, eob256, 0X81);
+ __m256i eob = _mm256_max_epi16(eob_lo, eob_hi);
+ __m256i eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+#if defined(_MSC_VER) && (_MSC_VER < 1910)
+ return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff;
+#else
+ return (uint16_t)_mm256_extract_epi16(eob, 0);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize_fp_16(
+ const __m256i *round, const __m256i *quant, const __m256i *dequant,
+ const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+ const __m256i coeff = load_tran_low(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const int32_t nzflag =
+ _mm256_movemask_epi8(_mm256_cmpgt_epi16(abs_coeff, *thr));
+
+ if (nzflag) {
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+ const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+ const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+ const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant);
+ const __m256i nz_mask =
+ _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+ store_tran_low(qcoeff, qcoeff_ptr);
+ store_tran_low(dqcoeff, dqcoeff_ptr);
+
+ *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+ } else {
+ store_zero_tran_low(qcoeff_ptr);
+ store_zero_tran_low(dqcoeff_ptr);
+ }
}
void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -51,10 +102,114 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- __m128i eob;
- __m256i round256, quant256, dequant256;
- __m256i eob256, thr256;
+ __m256i round, quant, dequant, thr;
+ __m256i eob_max = _mm256_setzero_si256();
+ (void)scan;
+
+ coeff_ptr += n_coeffs;
+ iscan += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+
+ // Setup global values
+ load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+ &dequant);
+ thr = _mm256_setzero_si256();
+
+ quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+ n_coeffs += 8 * 2;
+
+ // remove dc constants
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+ round = _mm256_permute2x128_si256(round, round, 0x31);
+ thr = _mm256_srai_epi16(dequant, 1);
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+ n_coeffs += 8 * 2;
+ }
+
+ *eob_ptr = get_max_eob(eob_max);
+}
+
+// Enable this flag when matching the optimized code to
+// vp9_quantize_fp_32x32_c(). Disabled, the optimized code will match the
+// existing ssse3 code and quantize_fp_32x32_nz_c().
+//
+// #define MATCH_VP9_QUANTIZE_FP_32X32_C
+
+#ifndef MATCH_VP9_QUANTIZE_FP_32X32_C
+static VPX_FORCE_INLINE void quantize_fp_32x32_16_no_nzflag(
+ const __m256i *round, const __m256i *quant, const __m256i *dequant,
+ const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+ const __m256i coeff = load_tran_low(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+ const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+ const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+ const __m256i abs_dqcoeff =
+ _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1);
+ const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff);
+ const __m256i nz_mask =
+ _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+ store_tran_low(qcoeff, qcoeff_ptr);
+ store_tran_low(dqcoeff, dqcoeff_ptr);
+
+ *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+ (void)thr;
+}
+#endif
+
+static VPX_FORCE_INLINE void quantize_fp_32x32_16(
+ const __m256i *round, const __m256i *quant, const __m256i *dequant,
+ const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+ const __m256i coeff = load_tran_low(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i thr_mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ const int32_t nzflag = _mm256_movemask_epi8(thr_mask);
+
+ if (nzflag) {
+#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(abs_coeff, *round), thr_mask);
+#else
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+#endif
+ const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+ const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+ const __m256i abs_dqcoeff =
+ _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1);
+ const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff);
+ const __m256i nz_mask =
+ _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+ store_tran_low(qcoeff, qcoeff_ptr);
+ store_tran_low(dqcoeff, dqcoeff_ptr);
+
+ *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+ } else {
+ store_zero_tran_low(qcoeff_ptr);
+ store_zero_tran_low(dqcoeff_ptr);
+ }
+}
+
+void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m256i round, quant, dequant, thr;
+ __m256i eob_max = _mm256_setzero_si256();
(void)scan;
coeff_ptr += n_coeffs;
@@ -63,74 +218,223 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dqcoeff_ptr += n_coeffs;
n_coeffs = -n_coeffs;
+ // Setup global values
+ load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+ &dequant);
+ thr = _mm256_srli_epi16(dequant, 2);
+ quant = _mm256_slli_epi16(quant, 1);
{
- __m256i coeff256;
-
- // Setup global values
- {
- const __m128i round = _mm_load_si128((const __m128i *)round_ptr);
- const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr);
- const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- round256 = _mm256_castsi128_si256(round);
- round256 = _mm256_permute4x64_epi64(round256, 0x54);
-
- quant256 = _mm256_castsi128_si256(quant);
- quant256 = _mm256_permute4x64_epi64(quant256, 0x54);
-
- dequant256 = _mm256_castsi128_si256(dequant);
- dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54);
- }
-
- {
- __m256i qcoeff256;
- __m256i qtmp256;
- coeff256 = load_tran_low(coeff_ptr + n_coeffs);
- qcoeff256 = _mm256_abs_epi16(coeff256);
- qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
- qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
- qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
- store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs);
- coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
- store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
- }
-
- eob256 = scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256);
- n_coeffs += 8 * 2;
+ const __m256i rnd = _mm256_set1_epi16((int16_t)1);
+ round = _mm256_add_epi16(round, rnd);
+ round = _mm256_srai_epi16(round, 1);
}
- // remove dc constants
- dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
- quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
- round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
+#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+ // calculating the zbin mask.
+ thr = _mm256_sub_epi16(thr, _mm256_set1_epi16(1));
+ quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+#else
+ quantize_fp_32x32_16_no_nzflag(
+ &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs,
+ qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max);
+#endif
+
+ n_coeffs += 8 * 2;
- thr256 = _mm256_srai_epi16(dequant256, 1);
+ // remove dc constants
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+ round = _mm256_permute2x128_si256(round, round, 0x31);
+ thr = _mm256_permute2x128_si256(thr, thr, 0x31);
// AC only loop
while (n_coeffs < 0) {
- __m256i coeff256 = load_tran_low(coeff_ptr + n_coeffs);
- __m256i qcoeff256 = _mm256_abs_epi16(coeff256);
- int32_t nzflag =
- _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256));
-
- if (nzflag) {
- __m256i qtmp256;
- qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
- qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
- qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
- store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs);
- coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
- store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
- eob256 = _mm256_max_epi16(
- eob256, scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256));
- } else {
- store_zero_tran_low(qcoeff_ptr + n_coeffs);
- store_zero_tran_low(dqcoeff_ptr + n_coeffs);
- }
+ quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
n_coeffs += 8 * 2;
}
- eob = _mm_max_epi16(_mm256_castsi256_si128(eob256),
- _mm256_extracti128_si256(eob256, 1));
+ *eob_ptr = get_max_eob(eob_max);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+ const __m256i *y,
+ int log_scale) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+ prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) {
+ const __m128i v = _mm_load_si128((const __m128i *)val_ptr);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc = _mm_unpacklo_epi16(v, zero);
+ const __m128i ac = _mm_unpackhi_epi16(v, zero);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static VPX_FORCE_INLINE void highbd_load_fp_values(
+ const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr,
+ __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) {
+ *round = highbd_init_256(round_ptr);
+ *quant = highbd_init_256(quant_ptr);
+ *dequant = highbd_init_256(dequant_ptr);
+}
+
+static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob(
+ const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) {
+ const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);
+ const __m256i packed_nz_mask_perm =
+ _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+ const __m256i iscan =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
+ const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm);
+ return _mm256_max_epi16(eobmax, nz_iscan);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_fp(
+ const __m256i *round, const __m256i *quant, const __m256i *dequant,
+ const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i tmp_rnd = _mm256_add_epi32(abs_coeff, *round);
+ const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0);
+ const __m256i abs_dq = _mm256_mullo_epi32(abs_q, *dequant);
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);
+
+ *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int step = 8;
+ __m256i round, quant, dequant;
+ __m256i eob_max = _mm256_setzero_si256();
+ (void)scan;
+
+ coeff_ptr += n_coeffs;
+ iscan += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+
+ // Setup global values
+ highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+ &dequant);
+
+ highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+
+ n_coeffs += step;
+
+ // remove dc constants
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+ round = _mm256_permute2x128_si256(round, round, 0x31);
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+ n_coeffs += step;
+ }
+
+ *eob_ptr = get_max_eob(eob_max);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_fp_32x32(
+ const __m256i *round, const __m256i *quant, const __m256i *dequant,
+ const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i thr_mask = _mm256_cmpgt_epi32(abs_coeff, *thr);
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, *round), thr_mask);
+ const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0);
+ const __m256i abs_dq =
+ _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, *dequant), 1);
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);
+
+ *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+void vp9_highbd_quantize_fp_32x32_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
+ const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ const int step = 8;
+ __m256i round, quant, dequant, thr;
+ __m256i eob_max = _mm256_setzero_si256();
+ (void)scan;
+
+ coeff_ptr += n_coeffs;
+ iscan += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+
+ // Setup global values
+ highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+ &dequant);
+ thr = _mm256_srli_epi32(dequant, 2);
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+ // calculating the zbin mask.
+ thr = _mm256_sub_epi32(thr, _mm256_set1_epi32(1));
+ quant = _mm256_slli_epi32(quant, 1);
+ round = _mm256_srai_epi32(_mm256_add_epi32(round, _mm256_set1_epi32(1)), 1);
+
+ highbd_quantize_fp_32x32(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+
+ n_coeffs += step;
+
+ // remove dc constants
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+ round = _mm256_permute2x128_si256(round, round, 0x31);
+ thr = _mm256_permute2x128_si256(thr, thr, 0x31);
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ highbd_quantize_fp_32x32(
+ &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs,
+ qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max);
+ n_coeffs += step;
+ }
- *eob_ptr = accumulate_eob(eob);
+ *eob_ptr = get_max_eob(eob_max);
}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c
index 4bcadaa6a..c87723443 100644
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -16,184 +16,110 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *round_ptr, const int16_t *quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- __m128i zero;
+ const __m128i zero = _mm_setzero_si128();
__m128i thr;
int nzflag;
- __m128i eob;
+ int index = 16;
__m128i round, quant, dequant;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i eob;
(void)scan;
- coeff_ptr += n_coeffs;
- iscan += n_coeffs;
- qcoeff_ptr += n_coeffs;
- dqcoeff_ptr += n_coeffs;
- n_coeffs = -n_coeffs;
- zero = _mm_setzero_si128();
-
- {
- __m128i coeff0, coeff1;
-
- // Setup global values
- {
- round = _mm_load_si128((const __m128i *)round_ptr);
- quant = _mm_load_si128((const __m128i *)quant_ptr);
- dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- }
+ // Setup global values.
+ load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- // Do DC and first 15 AC
- coeff0 = load_tran_low(coeff_ptr + n_coeffs);
- coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- round = _mm_unpackhi_epi64(round, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- quant = _mm_unpackhi_epi64(quant, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
- store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
- store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- dequant = _mm_unpackhi_epi64(dequant, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
- store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
- store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
- }
+ // Reinsert signs.
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob = _mm_max_epi16(eob, eob1);
- }
- n_coeffs += 8 * 2;
- }
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr);
+ store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
thr = _mm_srai_epi16(dequant, 1);
- // AC only loop
- while (n_coeffs < 0) {
- __m128i coeff0, coeff1;
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
-
- coeff0 = load_tran_low(coeff_ptr + n_coeffs);
- coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
- _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
-
- if (nzflag) {
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
- store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
- store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
- } else {
- store_zero_tran_low(qcoeff_ptr + n_coeffs);
- store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
-
- store_zero_tran_low(dqcoeff_ptr + n_coeffs);
- store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
- }
- }
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
if (nzflag) {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob0, eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob0 = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob0 = _mm_max_epi16(eob0, eob1);
+ __m128i eob0;
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr + index);
+ store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
+ } else {
+ store_zero_tran_low(qcoeff_ptr + index);
+ store_zero_tran_low(qcoeff_ptr + index + 8);
+
+ store_zero_tran_low(dqcoeff_ptr + index);
+ store_zero_tran_low(dqcoeff_ptr + index + 8);
}
- n_coeffs += 8 * 2;
- }
- // Accumulate EOB
- {
- __m128i eob_shuffled;
- eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
- eob = _mm_max_epi16(eob, eob_shuffled);
- *eob_ptr = _mm_extract_epi16(eob, 1);
+ index += 16;
}
+
+ *eob_ptr = accumulate_eob(eob);
}
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.c b/vp9/encoder/x86/vp9_quantize_ssse3.c
new file mode 100644
index 000000000..d35004e37
--- /dev/null
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.c
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vpx_dsp/x86/quantize_ssse3.h"
+
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i thr;
+ int nzflag;
+ int index = 16;
+ __m128i round, quant, dequant;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i eob;
+
+ (void)scan;
+
+ // Setup global values.
+ load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr);
+ store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+ thr = _mm_srai_epi16(dequant, 1);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+ if (nzflag) {
+ __m128i eob0;
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr + index);
+ store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ } else {
+ store_zero_tran_low(qcoeff_ptr + index);
+ store_zero_tran_low(qcoeff_ptr + index + 8);
+
+ store_zero_tran_low(dqcoeff_ptr + index);
+ store_zero_tran_low(dqcoeff_ptr + index + 8);
+ }
+
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
+
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one_s16 = _mm_set1_epi16(1);
+ __m128i thr;
+ int nzflag;
+ int index = 16;
+ __m128i round, quant, dequant;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i eob;
+
+ (void)scan;
+
+ // Setup global values.
+ load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
+ // The 32x32 halves round.
+ round = _mm_add_epi16(round, one_s16);
+ round = _mm_srli_epi16(round, 1);
+
+ // The 16x16 shifts by 16, the 32x32 shifts by 15. We want to use pmulhw so
+ // upshift quant to account for this.
+ quant = _mm_slli_epi16(quant, 1);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ // Get the abs value of qcoeff again so we can use shifts for division.
+ qcoeff0 = _mm_abs_epi16(qcoeff0);
+ qcoeff1 = _mm_abs_epi16(qcoeff1);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ // Divide by 2.
+ qcoeff0 = _mm_srli_epi16(qcoeff0, 1);
+ qcoeff1 = _mm_srli_epi16(qcoeff1, 1);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr);
+ store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+ thr = _mm_srai_epi16(dequant, 2);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ // Get the abs value of qcoeff again so we can use shifts for division.
+ qcoeff0 = _mm_abs_epi16(qcoeff0);
+ qcoeff1 = _mm_abs_epi16(qcoeff1);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ // Divide by 2.
+ qcoeff0 = _mm_srli_epi16(qcoeff0, 1);
+ qcoeff1 = _mm_srli_epi16(qcoeff1, 1);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr + index);
+ store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+ } else {
+ store_zero_tran_low(qcoeff_ptr + index);
+ store_zero_tran_low(qcoeff_ptr + index + 8);
+
+ store_zero_tran_low(dqcoeff_ptr + index);
+ store_zero_tran_low(dqcoeff_ptr + index + 8);
+ }
+
+ if (nzflag) {
+ const __m128i eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
deleted file mode 100644
index 680acfec6..000000000
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ /dev/null
@@ -1,178 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
-
-SECTION_RODATA
-pw_1: times 8 dw 1
-
-SECTION .text
-
-%macro QUANTIZE_FP 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, round, quant, \
- qcoeff, dqcoeff, dequant, \
- eob, scan, iscan
-
- ; actual quantize loop - setup pointers, rounders, etc.
- movifnidn coeffq, coeffmp
- movifnidn ncoeffq, ncoeffmp
- movifnidn roundq, roundmp
- movifnidn quantq, quantmp
- mova m1, [roundq] ; m1 = round
- mova m2, [quantq] ; m2 = quant
- mov r2, dequantmp
-%ifidn %1, fp_32x32
- pcmpeqw m5, m5
- psrlw m5, 15
- paddw m1, m5
- psrlw m1, 1 ; m1 = (m1 + 1) / 2
-%endif
- mova m3, [r2q] ; m3 = dequant
- mov r3, qcoeffmp
- mov r4, dqcoeffmp
- mov r5, iscanmp
-%ifidn %1, fp_32x32
- psllw m2, 1
-%endif
- pxor m5, m5 ; m5 = dedicated zero
-
- INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq
- lea r5q, [r5q+ncoeffq*2]
- INCREMENT_ELEMENTS_TRAN_LOW r3q, ncoeffq
- INCREMENT_ELEMENTS_TRAN_LOW r4q, ncoeffq
- neg ncoeffq
-
- ; get DC and first 15 AC coeffs
- LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i]
- LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i]
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
- pcmpeqw m7, m7
-
- paddsw m6, m1 ; m6 += round
- punpckhqdq m1, m1
- paddsw m11, m1 ; m11 += round
- pmulhw m8, m6, m2 ; m8 = m6*q>>16
- punpckhqdq m2, m2
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- psignw m8, m9 ; m8 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- STORE_TRAN_LOW 8, r3q, ncoeffq, 6, 11, 12
- STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12
-%ifidn %1, fp_32x32
- pabsw m8, m8
- pabsw m13, m13
-%endif
- pmullw m8, m3 ; r4[i] = r3[i] * q
- punpckhqdq m3, m3
- pmullw m13, m3 ; r4[i] = r3[i] * q
-%ifidn %1, fp_32x32
- psrlw m8, 1
- psrlw m13, 1
- psignw m8, m9
- psignw m13, m10
- psrlw m0, m3, 2
-%else
- psrlw m0, m3, 1
-%endif
- STORE_TRAN_LOW 8, r4q, ncoeffq, 6, 11, 12
- STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12
- pcmpeqw m8, m5 ; m8 = c[i] == 0
- pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
- psubw m6, m7 ; m6 = scan[i] + 1
- psubw m11, m7 ; m11 = scan[i] + 1
- pandn m8, m6 ; m8 = max(eob)
- pandn m13, m11 ; m13 = max(eob)
- pmaxsw m8, m13
- add ncoeffq, mmsize
- jz .accumulate_eob
-
-.ac_only_loop:
- LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i]
- LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i]
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
-
- pcmpgtw m7, m6, m0
- pcmpgtw m12, m11, m0
- pmovmskb r6d, m7
- pmovmskb r2d, m12
-
- or r6, r2
- jz .skip_iter
-
- pcmpeqw m7, m7
-
- paddsw m6, m1 ; m6 += round
- paddsw m11, m1 ; m11 += round
- pmulhw m14, m6, m2 ; m14 = m6*q>>16
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- psignw m14, m9 ; m14 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- STORE_TRAN_LOW 14, r3q, ncoeffq, 6, 11, 12
- STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12
-%ifidn %1, fp_32x32
- pabsw m14, m14
- pabsw m13, m13
-%endif
- pmullw m14, m3 ; r4[i] = r3[i] * q
- pmullw m13, m3 ; r4[i] = r3[i] * q
-%ifidn %1, fp_32x32
- psrlw m14, 1
- psrlw m13, 1
- psignw m14, m9
- psignw m13, m10
-%endif
- STORE_TRAN_LOW 14, r4q, ncoeffq, 6, 11, 12
- STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12
- pcmpeqw m14, m5 ; m14 = c[i] == 0
- pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
- psubw m6, m7 ; m6 = scan[i] + 1
- psubw m11, m7 ; m11 = scan[i] + 1
- pandn m14, m6 ; m14 = max(eob)
- pandn m13, m11 ; m13 = max(eob)
- pmaxsw m8, m14
- pmaxsw m8, m13
- add ncoeffq, mmsize
- jl .ac_only_loop
-
- jmp .accumulate_eob
-.skip_iter:
- STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq
- STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8
- STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq
- STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8
- add ncoeffq, mmsize
- jl .ac_only_loop
-
-.accumulate_eob:
- ; horizontally accumulate/max eobs and write into [eob] memory pointer
- mov r2, eobmp
- pshufd m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0x1
- pmaxsw m8, m7
- pextrw r6, m8, 0
- mov [r2], r6w
- RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FP fp, 7
-QUANTIZE_FP fp_32x32, 7
diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc
index f4d7f7e9e..02e50a857 100644
--- a/vp9/ratectrl_rtc.cc
+++ b/vp9/ratectrl_rtc.cc
@@ -158,6 +158,8 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) {
}
vp9_set_mb_mi(cm, cm->width, cm->height);
cm->frame_type = frame_params.frame_type;
+ // This is needed to ensure key frame does not get unset in rc_get_svc_params.
+ cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0;
cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
cpi_->sf.use_nonrd_pick_mode = 1;
if (cpi_->svc.number_spatial_layers == 1 &&
@@ -205,12 +207,16 @@ int VP9RateControlRTC::GetLoopfilterLevel() const {
return lf->filter_level;
}
-signed char *VP9RateControlRTC::GetCyclicRefreshMap() const {
- return cpi_->cyclic_refresh->map;
-}
+bool VP9RateControlRTC::GetSegmentationData(
+ VP9SegmentationData *segmentation_data) const {
+ if (!cpi_->cyclic_refresh->apply_cyclic_refresh) return false;
-int *VP9RateControlRTC::GetDeltaQ() const {
- return cpi_->cyclic_refresh->qindex_delta;
+ segmentation_data->segmentation_map = cpi_->segmentation_map;
+ segmentation_data->segmentation_map_size =
+ cpi_->common.mi_cols * cpi_->common.mi_rows;
+ segmentation_data->delta_q = cpi_->cyclic_refresh->qindex_delta;
+ segmentation_data->delta_q_size = 3u;
+ return true;
}
void VP9RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h
index d2b9417ae..b209e4db6 100644
--- a/vp9/ratectrl_rtc.h
+++ b/vp9/ratectrl_rtc.h
@@ -58,6 +58,13 @@ struct VP9FrameParamsQpRTC {
int temporal_layer_id;
};
+struct VP9SegmentationData {
+ const uint8_t *segmentation_map;
+ size_t segmentation_map_size;
+ const int *delta_q;
+ size_t delta_q_size;
+};
+
// This interface allows using VP9 real-time rate control without initializing
// the encoder. To use this interface, you need to link with libvpxrc.a.
//
@@ -110,8 +117,7 @@ class VP9RateControlRTC {
// GetQP() needs to be called after ComputeQP() to get the latest QP
int GetQP() const;
int GetLoopfilterLevel() const;
- signed char *GetCyclicRefreshMap() const;
- int *GetDeltaQ() const;
+ bool GetSegmentationData(VP9SegmentationData *segmentation_data) const;
void ComputeQP(const VP9FrameParamsQpRTC &frame_params);
// Feedback to rate control with the size of current encoded frame
void PostEncodeUpdate(uint64_t encoded_frame_size);
diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc
index 654699e1b..f42912d35 100644
--- a/vp9/simple_encode.cc
+++ b/vp9/simple_encode.cc
@@ -744,10 +744,12 @@ static void UpdateGroupOfPicture(const VP9_COMP *cpi, int start_coding_index,
}
#define SET_STRUCT_VALUE(config, structure, ret, field) \
- if (strcmp(config.name, #field) == 0) { \
- structure->field = atoi(config.value); \
- ret = 1; \
- }
+ do { \
+ if (strcmp(config.name, #field) == 0) { \
+ structure->field = atoi(config.value); \
+ ret = 1; \
+ } \
+ } while (false)
static void UpdateEncodeConfig(const EncodeConfig &config,
VP9EncoderConfig *oxcf) {
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 05ac9e169..dee175dc0 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -170,8 +170,8 @@ static vpx_codec_err_t update_error_state(
static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
const vpx_codec_enc_cfg_t *cfg,
const struct vp9_extracfg *extra_cfg) {
- RANGE_CHECK(cfg, g_w, 1, 65535); // 16 bits available
- RANGE_CHECK(cfg, g_h, 1, 65535); // 16 bits available
+ RANGE_CHECK(cfg, g_w, 1, 65536); // 16 bits available
+ RANGE_CHECK(cfg, g_h, 1, 65536); // 16 bits available
RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000);
RANGE_CHECK_HI(cfg, g_profile, 3);
@@ -1014,6 +1014,7 @@ static vpx_codec_err_t ctrl_set_aq_mode(vpx_codec_alg_priv_t *ctx,
va_list args) {
struct vp9_extracfg extra_cfg = ctx->extra_cfg;
extra_cfg.aq_mode = CAST(VP9E_SET_AQ_MODE, args);
+ if (ctx->cpi->fixed_qp_onepass) extra_cfg.aq_mode = 0;
return update_extra_cfg(ctx, &extra_cfg);
}
@@ -1357,8 +1358,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
unsigned int lib_flags = 0;
YV12_BUFFER_CONFIG sd;
int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, pts);
- int64_t dst_end_time_stamp =
- timebase_units_to_ticks(timestamp_ratio, pts + duration);
size_t size, cx_data_sz;
unsigned char *cx_data;
@@ -1369,6 +1368,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
if (img != NULL) {
+ const int64_t dst_end_time_stamp =
+ timebase_units_to_ticks(timestamp_ratio, pts + duration);
res = image2yuvconfig(img, &sd);
// Store the original flags in to the frame buffer. Will extract the
@@ -1405,6 +1406,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
// compute first pass stats
if (img) {
int ret;
+ int64_t dst_end_time_stamp;
vpx_codec_cx_pkt_t fps_pkt;
ENCODE_FRAME_RESULT encode_frame_result;
vp9_init_encode_frame_result(&encode_frame_result);
@@ -1430,6 +1432,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
#endif // !CONFIG_REALTIME_ONLY
} else {
ENCODE_FRAME_RESULT encode_frame_result;
+ int64_t dst_end_time_stamp;
vp9_init_encode_frame_result(&encode_frame_result);
while (cx_data_sz >= ctx->cx_data_sz / 2 &&
-1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data,
@@ -1525,9 +1528,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
cx_data += size;
cx_data_sz -= size;
- if (is_one_pass_cbr_svc(cpi) &&
- (cpi->svc.spatial_layer_id ==
- cpi->svc.number_spatial_layers - 1)) {
+ if (is_one_pass_svc(cpi) && (cpi->svc.spatial_layer_id ==
+ cpi->svc.number_spatial_layers - 1)) {
// Encoded all spatial layers; exit loop.
break;
}
@@ -1950,6 +1952,24 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static vpx_codec_err_t ctrl_set_quantizer_one_pass(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ VP9_COMP *const cpi = ctx->cpi;
+ const int qp = va_arg(args, int);
+ vpx_codec_enc_cfg_t *cfg = &ctx->cfg;
+ struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+ vpx_codec_err_t res;
+
+ if (qp < 0 || qp > 63) return VPX_CODEC_INVALID_PARAM;
+
+ cfg->rc_min_quantizer = cfg->rc_max_quantizer = qp;
+ extra_cfg.aq_mode = 0;
+ cpi->fixed_qp_onepass = 1;
+
+ res = update_extra_cfg(ctx, &extra_cfg);
+ return res;
+}
+
static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ VP8_COPY_REFERENCE, ctrl_copy_reference },
@@ -2004,6 +2024,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ VP9E_SET_DISABLE_LOOPFILTER, ctrl_set_disable_loopfilter },
{ VP9E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl },
{ VP9E_SET_EXTERNAL_RATE_CONTROL, ctrl_set_external_rate_control },
+ { VP9E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass },
// Getters
{ VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer },
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 3c42c7dfe..bdfe21793 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -334,7 +334,6 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
const uint8_t *data, unsigned int data_sz,
void *user_priv, long deadline) {
const uint8_t *data_start = data;
- const uint8_t *const data_end = data + data_sz;
vpx_codec_err_t res;
uint32_t frame_sizes[8];
int frame_count;
@@ -362,6 +361,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
// Decode in serial mode.
if (frame_count > 0) {
+ const uint8_t *const data_end = data + data_sz;
int i;
for (i = 0; i < frame_count; ++i) {
@@ -379,6 +379,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
data_start += frame_size;
}
} else {
+ const uint8_t *const data_end = data + data_sz;
while (data_start < data_end) {
const uint32_t frame_size = (uint32_t)(data_end - data_start);
const vpx_codec_err_t res =
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 92a7fddb9..9072628f2 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -111,8 +111,10 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c
VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c
@@ -121,10 +123,6 @@ endif
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-ifeq ($(VPX_ARCH_X86_64),yes)
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
-endif
-
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index a61238cb1..e0b679fbb 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -757,6 +757,16 @@ enum vp8e_enc_control_id {
* Supported in codecs: VP8
*/
VP8E_SET_RTC_EXTERNAL_RATECTRL,
+
+ /*!\brief Codec control to set quantizer for the next frame.
+ *
+ * This will turn off cyclic refresh. Only applicable to 1-pass without
+ * spatial layers.
+ *
+ * Supported in codecs: VP9
+ *
+ */
+ VP9E_SET_QUANTIZER_ONE_PASS,
};
/*!\brief vpx 1-D scaling mode
@@ -1085,6 +1095,8 @@ VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *)
#define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS
VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int)
#define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL
+VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int)
+#define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS
/*!\endcond */
/*! @} - end defgroup vp8_encoder */
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 21254bb54..efaf5ef36 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -115,14 +115,14 @@ typedef int64_t vpx_codec_pts_t;
* support frame types that are codec specific (MPEG-1 D-frames for example)
*/
typedef uint32_t vpx_codec_frame_flags_t;
-#define VPX_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */
+#define VPX_FRAME_IS_KEY 0x1u /**< frame is the start of a GOP */
/*!\brief frame can be dropped without affecting the stream (no future frame
* depends on this one) */
-#define VPX_FRAME_IS_DROPPABLE 0x2
+#define VPX_FRAME_IS_DROPPABLE 0x2u
/*!\brief frame should be decoded but will not be shown */
-#define VPX_FRAME_IS_INVISIBLE 0x4
+#define VPX_FRAME_IS_INVISIBLE 0x4u
/*!\brief this is a fragment of the encoded frame */
-#define VPX_FRAME_IS_FRAGMENT 0x8
+#define VPX_FRAME_IS_FRAGMENT 0x8u
/*!\brief Error Resilient flags
*
@@ -132,12 +132,13 @@ typedef uint32_t vpx_codec_frame_flags_t;
*/
typedef uint32_t vpx_codec_er_flags_t;
/*!\brief Improve resiliency against losses of whole frames */
-#define VPX_ERROR_RESILIENT_DEFAULT 0x1
+#define VPX_ERROR_RESILIENT_DEFAULT 0x1u
/*!\brief The frame partitions are independently decodable by the bool decoder,
* meaning that partitions can be decoded even though earlier partitions have
* been lost. Note that intra prediction is still done over the partition
- * boundary. */
-#define VPX_ERROR_RESILIENT_PARTITIONS 0x2
+ * boundary.
+ * \note This is only supported by VP8.*/
+#define VPX_ERROR_RESILIENT_PARTITIONS 0x2u
/*!\brief Encoder output packet variants
*
diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h
index a193e5595..3c5fc8cfc 100644
--- a/vpx/vpx_ext_ratectrl.h
+++ b/vpx/vpx_ext_ratectrl.h
@@ -25,7 +25,27 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures.
*/
-#define VPX_EXT_RATECTRL_ABI_VERSION (1)
+#define VPX_EXT_RATECTRL_ABI_VERSION (6)
+
+/*!\brief The control type of the inference API.
+ * In VPX_RC_QP mode, the external rate control model determines the
+ * quantization parameter (QP) for each frame.
+ * In VPX_RC_GOP mode, the external rate control model determines the
+ * group of picture (GOP) of the video sequence.
+ * In VPX_RC_RDMULT mode, the external rate control model determines the
+ * rate-distortion multiplier (rdmult) for the current frame.
+ * In VPX_RC_GOP_QP mode, the external rate control model determines
+ * both the QP and the GOP.
+ * In VPX_RC_GOP_QP_RDMULT mode, the external rate control model determines
+ * the QP, GOP and the rdmult.
+ */
+typedef enum vpx_rc_type {
+ VPX_RC_QP = 1 << 0,
+ VPX_RC_GOP = 1 << 1,
+ VPX_RC_RDMULT = 1 << 2,
+ VPX_RC_GOP_QP = VPX_RC_QP | VPX_RC_GOP,
+ VPX_RC_GOP_QP_RDMULT = VPX_RC_QP | VPX_RC_GOP | VPX_RC_RDMULT
+} vpx_rc_type_t;
/*!\brief Abstract rate control model handler
*
@@ -34,11 +54,27 @@ extern "C" {
*/
typedef void *vpx_rc_model_t;
+/*!\brief A reserved value for the q index.
+ * If the external rate control model returns this value,
+ * the encoder will use the default q selected by libvpx's rate control
+ * system.
+ */
+#define VPX_DEFAULT_Q -1
+
+/*!\brief A reserved value for the rdmult.
+ * If the external rate control model returns this value,
+ * the encoder will use the default rdmult selected by libvpx's rate control
+ * system.
+ */
+#define VPX_DEFAULT_RDMULT -1
+
/*!\brief Encode frame decision made by the external rate control model
*
* The encoder will receive the decision from the external rate control model
* through get_encodeframe_decision() defined in vpx_rc_funcs_t.
*
+ * If q_index = VPX_DEFAULT_Q, the encoder will use libvpx's default q.
+ *
* If max_frame_size = 0, the encoding ignores max frame size limit.
* If max_frame_size = -1, the encoding uses VP9's max frame size as the limit.
* If the encoded frame size is larger than max_frame_size, the frame is
@@ -67,7 +103,7 @@ typedef struct vpx_rc_encodeframe_info {
int show_index; /**< display index, starts from zero*/
int coding_index; /**< coding index, starts from zero*/
/*!
- * index in group of picture, starts from zero.
+ * index of the current frame in this group of picture, starts from zero.
*/
int gop_index;
int ref_frame_coding_indexes[3]; /**< three reference frames' coding indices*/
@@ -77,6 +113,14 @@ typedef struct vpx_rc_encodeframe_info {
* 1: Valid
*/
int ref_frame_valid_list[3];
+ /*!
+ * The length of the current GOP.
+ */
+ int gop_size;
+ /*!
+ * Whether the current GOP uses an alt ref.
+ */
+ int use_alt_ref;
} vpx_rc_encodeframe_info_t;
/*!\brief Frame coding result
@@ -258,6 +302,84 @@ typedef struct vpx_rc_config {
int frame_rate_den; /**< denominator of frame rate */
} vpx_rc_config_t;
+/*!\brief Information passed to the external rate control model to
+ * help make GOP decisions.
+ */
+typedef struct vpx_rc_gop_info {
+ /*!
+ * Minimum allowed gf interval, fixed for the whole clip.
+ * Note that it will be modified to match vp9's level constraints
+ * in the encoder.
+ * The level constraint is defined in vp9_encoder.c:
+ * const Vp9LevelSpec vp9_level_defs[VP9_LEVELS].
+ */
+ int min_gf_interval;
+ /*!
+ * Maximum allowed gf interval, fixed for the whole clip.
+ */
+ int max_gf_interval;
+ /*!
+ * Minimum allowed gf interval for the current GOP, determined
+ * by the encoder.
+ */
+ int active_min_gf_interval;
+ /*!
+ * Maximum allowed gf interval for the current GOP, determined
+ * by the encoder.
+ */
+ int active_max_gf_interval;
+ /*!
+ * Whether to allow the use of alt ref, determined by the encoder.
+ * It is fixed for the entire encode.
+ * See function "is_altref_enabled" in vp9_encoder.h.
+ */
+ int allow_alt_ref;
+ /*!
+ * Is the current frame a key frame.
+ */
+ int is_key_frame;
+ /*!
+ * Does the previous gop use alt ref or not.
+ */
+ int last_gop_use_alt_ref;
+ /*!
+ * Current frame distance to the last keyframe, e.g., if Nth frame is a key,
+ * then the value of the N+1 th frame is 1.
+ */
+ int frames_since_key;
+ /*!
+ * Current frame distance to the next keyframe, e.g. if Nth frame is a key,
+ * then the value of frame N - 1 is 1.
+ */
+ int frames_to_key;
+ /*!
+ * Number of lookahead source frames.
+ */
+ int lag_in_frames;
+ /*!
+ * Display index (temporal stamp) of this frame in the whole clip,
+ * starts from zero.
+ */
+ int show_index;
+ /*!
+ * Coding index of this frame in the whole clip, starts from zero.
+ */
+ int coding_index;
+ /*!
+ * The index of the current gop, starts from zero, resets to zero
+ * when a keyframe is set.
+ */
+ int gop_global_index;
+} vpx_rc_gop_info_t;
+
+/*!\brief The decision made by the external rate control model to set the
+ * group of picture.
+ */
+typedef struct vpx_rc_gop_decision {
+ int gop_coding_frames; /**< The number of frames of this GOP */
+ int use_alt_ref; /**< Whether to use alt ref for this GOP */
+} vpx_rc_gop_decision_t;
+
/*!\brief Create an external rate control model callback prototype
*
* This callback is invoked by the encoder to create an external rate control
@@ -310,6 +432,32 @@ typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)(
vpx_rc_model_t rate_ctrl_model,
const vpx_rc_encodeframe_result_t *encode_frame_result);
+/*!\brief Get the GOP structure from the external rate control model.
+ *
+ * This callback is invoked by the encoder to get GOP decisions from
+ * the external rate control model.
+ *
+ * \param[in] rate_ctrl_model rate control model
+ * \param[in] gop_info information collected from the encoder
+ * \param[out] gop_decision GOP decision from the model
+ */
+typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)(
+ vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
+ vpx_rc_gop_decision_t *gop_decision);
+
+/*!\brief Get the frame rdmult from the external rate control model.
+ *
+ * This callback is invoked by the encoder to get rdmult from
+ * the external rate control model.
+ *
+ * \param[in] rate_ctrl_model rate control model
+ * \param[in] frame_info information collected from the encoder
+ * \param[out] rdmult frame rate-distortion multiplier from the model
+ */
+typedef vpx_rc_status_t (*vpx_rc_get_frame_rdmult_cb_fn_t)(
+ vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *frame_info,
+ int *rdmult);
+
/*!\brief Delete the external rate control model callback prototype
*
* This callback is invoked by the encoder to delete the external rate control
@@ -328,6 +476,10 @@ typedef vpx_rc_status_t (*vpx_rc_delete_model_cb_fn_t)(
*/
typedef struct vpx_rc_funcs {
/*!
+ * The rate control type of this API.
+ */
+ vpx_rc_type_t rc_type;
+ /*!
* Create an external rate control model.
*/
vpx_rc_create_model_cb_fn_t create_model;
@@ -344,6 +496,14 @@ typedef struct vpx_rc_funcs {
*/
vpx_rc_update_encodeframe_result_cb_fn_t update_encodeframe_result;
/*!
+ * Get GOP decisions from the external rate control model.
+ */
+ vpx_rc_get_gop_decision_cb_fn_t get_gop_decision;
+ /*!
+ * Get rdmult for the frame from the external rate control model.
+ */
+ vpx_rc_get_frame_rdmult_cb_fn_t get_frame_rdmult;
+ /*!
* Delete the external rate control model.
*/
vpx_rc_delete_model_cb_fn_t delete_model;
diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c
index 67f43246a..a458ecaa4 100644
--- a/vpx_dsp/arm/fdct16x16_neon.c
+++ b/vpx_dsp/arm/fdct16x16_neon.c
@@ -35,22 +35,23 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
int16x8_t temp3[16];
// Left half.
- load(input, stride, temp0);
- cross_input(temp0, temp1, 0);
- vpx_fdct16x16_body(temp1, temp0);
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp1);
+ vpx_fdct8x16_body(temp1, temp0);
// Right half.
- load(input + 8, stride, temp1);
- cross_input(temp1, temp2, 0);
- vpx_fdct16x16_body(temp2, temp1);
+ load_cross(input + 8, stride, temp1);
+ scale_input(temp1, temp2);
+ vpx_fdct8x16_body(temp2, temp1);
// Transpose top left and top right quarters into one contiguous location to
// process to the top half.
- transpose_8x8(&temp0[0], &temp2[0]);
- transpose_8x8(&temp1[0], &temp2[8]);
+
+ transpose_s16_8x8_new(&temp0[0], &temp2[0]);
+ transpose_s16_8x8_new(&temp1[0], &temp2[8]);
partial_round_shift(temp2);
- cross_input(temp2, temp3, 1);
- vpx_fdct16x16_body(temp3, temp2);
+ cross_input(temp2, temp3);
+ vpx_fdct8x16_body(temp3, temp2);
transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4],
&temp2[5], &temp2[6], &temp2[7]);
transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12],
@@ -61,12 +62,13 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
// Transpose bottom left and bottom right quarters into one contiguous
// location to process to the bottom half.
- transpose_8x8(&temp0[8], &temp1[0]);
+ transpose_s16_8x8_new(&temp0[8], &temp1[0]);
+
transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
&temp1[13], &temp1[14], &temp1[15]);
partial_round_shift(temp1);
- cross_input(temp1, temp0, 1);
- vpx_fdct16x16_body(temp0, temp1);
+ cross_input(temp1, temp0);
+ vpx_fdct8x16_body(temp0, temp1);
transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4],
&temp1[5], &temp1[6], &temp1[7]);
transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
@@ -74,5 +76,58 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output, temp1);
store(output + 8, temp1 + 8);
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[16];
+ int32x4_t left1[16], left2[16], left3[16], left4[16], right1[16], right2[16],
+ right3[16], right4[16];
+
+ // Left half.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ vpx_highbd_fdct8x16_body(left1, right1);
+
+ // right half.
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ vpx_highbd_fdct8x16_body(left2, right2);
+
+ // Transpose top left and top right quarters into one contiguous location to
+ // process to the top half.
+
+ transpose_s32_8x8_2(left1, right1, left3, right3);
+ transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8);
+
+ highbd_partial_round_shift(left3, right3);
+ highbd_cross_input(left3, right3, left1, right1);
+ vpx_highbd_fdct8x16_body(left1, right1);
+
+ // Transpose bottom left and bottom right quarters into one contiguous
+ // location to process to the bottom half.
+
+ highbd_partial_round_shift(left4, right4);
+ highbd_cross_input(left4, right4, left2, right2);
+ vpx_highbd_fdct8x16_body(left2, right2);
+
+ transpose_s32_8x8_2(left1, right1, left3, right3);
+ transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8);
+ store16_s32(output, left3);
+ output += 4;
+ store16_s32(output, right3);
+ output += 4;
+
+ store16_s32(output, left4);
+ output += 4;
+ store16_s32(output, right4);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
// __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4
diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h
index 0dd21153f..43d820b6b 100644
--- a/vpx_dsp/arm/fdct16x16_neon.h
+++ b/vpx_dsp/arm/fdct16x16_neon.h
@@ -13,6 +13,8 @@
#include <arm_neon.h>
+#include "fdct_neon.h"
+
static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
b[0] = vld1q_s16(a);
a += stride;
@@ -72,45 +74,67 @@ static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
// To maybe reduce register usage this could be combined with the load() step to
// get the first 4 and last 4 values, cross those, then load the middle 8 values
// and cross them.
+static INLINE void scale_input(const int16x8_t *a /*[16]*/,
+ int16x8_t *b /*[16]*/) {
+ b[0] = vshlq_n_s16(a[0], 2);
+ b[1] = vshlq_n_s16(a[1], 2);
+ b[2] = vshlq_n_s16(a[2], 2);
+ b[3] = vshlq_n_s16(a[3], 2);
+ b[4] = vshlq_n_s16(a[4], 2);
+ b[5] = vshlq_n_s16(a[5], 2);
+ b[6] = vshlq_n_s16(a[6], 2);
+ b[7] = vshlq_n_s16(a[7], 2);
+
+ b[8] = vshlq_n_s16(a[8], 2);
+ b[9] = vshlq_n_s16(a[9], 2);
+ b[10] = vshlq_n_s16(a[10], 2);
+ b[11] = vshlq_n_s16(a[11], 2);
+ b[12] = vshlq_n_s16(a[12], 2);
+ b[13] = vshlq_n_s16(a[13], 2);
+ b[14] = vshlq_n_s16(a[14], 2);
+ b[15] = vshlq_n_s16(a[15], 2);
+}
+
static INLINE void cross_input(const int16x8_t *a /*[16]*/,
- int16x8_t *b /*[16]*/, const int pass) {
- if (pass == 0) {
- b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2);
- b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2);
- b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2);
- b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2);
- b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2);
- b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2);
- b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2);
- b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2);
-
- b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2);
- b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2);
- b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2);
- b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2);
- b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2);
- b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2);
- b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2);
- b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2);
- } else {
- b[0] = vaddq_s16(a[0], a[15]);
- b[1] = vaddq_s16(a[1], a[14]);
- b[2] = vaddq_s16(a[2], a[13]);
- b[3] = vaddq_s16(a[3], a[12]);
- b[4] = vaddq_s16(a[4], a[11]);
- b[5] = vaddq_s16(a[5], a[10]);
- b[6] = vaddq_s16(a[6], a[9]);
- b[7] = vaddq_s16(a[7], a[8]);
-
- b[8] = vsubq_s16(a[7], a[8]);
- b[9] = vsubq_s16(a[6], a[9]);
- b[10] = vsubq_s16(a[5], a[10]);
- b[11] = vsubq_s16(a[4], a[11]);
- b[12] = vsubq_s16(a[3], a[12]);
- b[13] = vsubq_s16(a[2], a[13]);
- b[14] = vsubq_s16(a[1], a[14]);
- b[15] = vsubq_s16(a[0], a[15]);
- }
+ int16x8_t *b /*[16]*/) {
+ b[0] = vaddq_s16(a[0], a[15]);
+ b[1] = vaddq_s16(a[1], a[14]);
+ b[2] = vaddq_s16(a[2], a[13]);
+ b[3] = vaddq_s16(a[3], a[12]);
+ b[4] = vaddq_s16(a[4], a[11]);
+ b[5] = vaddq_s16(a[5], a[10]);
+ b[6] = vaddq_s16(a[6], a[9]);
+ b[7] = vaddq_s16(a[7], a[8]);
+
+ b[8] = vsubq_s16(a[7], a[8]);
+ b[9] = vsubq_s16(a[6], a[9]);
+ b[10] = vsubq_s16(a[5], a[10]);
+ b[11] = vsubq_s16(a[4], a[11]);
+ b[12] = vsubq_s16(a[3], a[12]);
+ b[13] = vsubq_s16(a[2], a[13]);
+ b[14] = vsubq_s16(a[1], a[14]);
+ b[15] = vsubq_s16(a[0], a[15]);
+}
+
+static INLINE void load_cross(const int16_t *a, int stride,
+ int16x8_t *b /*[16]*/) {
+ b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
+ b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
+ b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
+ b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
+ b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
+ b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
+ b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
+ b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));
+
+ b[8] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));
+ b[9] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
+ b[10] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
+ b[11] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
+ b[12] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
+ b[13] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
+ b[14] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
+ b[15] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
}
// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
@@ -135,84 +159,9 @@ static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
}
-// fdct_round_shift((a +/- b) * c)
-static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
- const tran_high_t c, int16x8_t *add,
- int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
- const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c);
- const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
- const tran_coef_t c0,
- const tran_coef_t c1, int16x8_t *add,
- int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
- const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
- const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
- const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0);
- const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
-// are all in-place.
-static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/,
- int16x8_t *b /*[8]*/) {
- // Swap 16 bit elements.
- const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
- const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
- const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
- const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
-
- // Swap 32 bit elements.
- const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
- vreinterpretq_s32_s16(c1.val[0]));
- const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
- vreinterpretq_s32_s16(c1.val[1]));
- const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
- vreinterpretq_s32_s16(c3.val[0]));
- const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
- vreinterpretq_s32_s16(c3.val[1]));
-
- // Swap 64 bit elements
- const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
- const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
- const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
- const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
-
- b[0] = e0.val[0];
- b[1] = e1.val[0];
- b[2] = e2.val[0];
- b[3] = e3.val[0];
- b[4] = e0.val[1];
- b[5] = e1.val[1];
- b[6] = e2.val[1];
- b[7] = e3.val[1];
-}
-
// Main body of fdct16x16.
-static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
- int16x8_t *out /*[16]*/) {
+static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/,
+ int16x8_t *out /*[16]*/) {
int16x8_t s[8];
int16x8_t x[4];
int16x8_t step[8];
@@ -237,16 +186,17 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
// out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
// out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
- butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]);
- // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+ butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+ &out[8]);
+ // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
// out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
- butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]);
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]);
// Stage 2
// Re-using source s5/s6
// s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
// s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
- butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]);
+ butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]);
// Stage 3
x[0] = vaddq_s16(s[4], s[5]);
@@ -255,12 +205,12 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
x[3] = vaddq_s16(s[7], s[6]);
// Stage 4
- // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
- // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
- butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]);
- // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
- // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
- butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]);
+ // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64)
+ // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]);
+ // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+ // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]);
// step 2
// From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
@@ -272,8 +222,8 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
// step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
// step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
// step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
- butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]);
- butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]);
+ butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]);
+ butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]);
// step 3
s[0] = vaddq_s16(in[8], s[3]);
@@ -286,13 +236,15 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
s[7] = vaddq_s16(in[15], s[4]);
// step 4
- // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
- // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
- butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]);
+ // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] *
+ // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1]
+ // * cospi_8_64)
+ butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]);
// step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
- // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
- butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]);
+ // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] *
+ // cospi_24_64)
+ butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]);
// step 5
step[0] = vaddq_s16(s[0], s[1]);
@@ -305,23 +257,368 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
step[7] = vaddq_s16(s[7], s[6]);
// step 6
- // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
- // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
- // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
- // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
- // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
- // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
- // cospi_22_64)
- // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
- // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
- butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9],
+ // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+ // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+ butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9],
&out[7]);
- butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1],
+ // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64)
+ // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+ butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1],
&out[15]);
- butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13],
+
+ // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+ // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64)
+ butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13],
&out[3]);
- butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5],
+
+ // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+ // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+ butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5],
&out[11]);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/,
+ int32x4_t *left /*[16]*/,
+ int32x4_t *right /* [16] */) {
+ left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
+ left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
+ left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
+ left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
+ left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
+ left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
+ left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
+ left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
+ left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
+ left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
+ left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
+ left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
+ left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
+ left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
+ left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
+ left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+
+ right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+ right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+ right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+ right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+ right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+ right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+ right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+ right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+ right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+ right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+ right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+ right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+ right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+ right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+ right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
+ right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
+}
+
+static INLINE void highbd_cross_input(const int32x4_t *a_left /*[16]*/,
+ int32x4_t *a_right /*[16]*/,
+ int32x4_t *b_left /*[16]*/,
+ int32x4_t *b_right /*[16]*/) {
+ b_left[0] = vaddq_s32(a_left[0], a_left[15]);
+ b_left[1] = vaddq_s32(a_left[1], a_left[14]);
+ b_left[2] = vaddq_s32(a_left[2], a_left[13]);
+ b_left[3] = vaddq_s32(a_left[3], a_left[12]);
+ b_left[4] = vaddq_s32(a_left[4], a_left[11]);
+ b_left[5] = vaddq_s32(a_left[5], a_left[10]);
+ b_left[6] = vaddq_s32(a_left[6], a_left[9]);
+ b_left[7] = vaddq_s32(a_left[7], a_left[8]);
+
+ b_right[0] = vaddq_s32(a_right[0], a_right[15]);
+ b_right[1] = vaddq_s32(a_right[1], a_right[14]);
+ b_right[2] = vaddq_s32(a_right[2], a_right[13]);
+ b_right[3] = vaddq_s32(a_right[3], a_right[12]);
+ b_right[4] = vaddq_s32(a_right[4], a_right[11]);
+ b_right[5] = vaddq_s32(a_right[5], a_right[10]);
+ b_right[6] = vaddq_s32(a_right[6], a_right[9]);
+ b_right[7] = vaddq_s32(a_right[7], a_right[8]);
+
+ b_left[8] = vsubq_s32(a_left[7], a_left[8]);
+ b_left[9] = vsubq_s32(a_left[6], a_left[9]);
+ b_left[10] = vsubq_s32(a_left[5], a_left[10]);
+ b_left[11] = vsubq_s32(a_left[4], a_left[11]);
+ b_left[12] = vsubq_s32(a_left[3], a_left[12]);
+ b_left[13] = vsubq_s32(a_left[2], a_left[13]);
+ b_left[14] = vsubq_s32(a_left[1], a_left[14]);
+ b_left[15] = vsubq_s32(a_left[0], a_left[15]);
+
+ b_right[8] = vsubq_s32(a_right[7], a_right[8]);
+ b_right[9] = vsubq_s32(a_right[6], a_right[9]);
+ b_right[10] = vsubq_s32(a_right[5], a_right[10]);
+ b_right[11] = vsubq_s32(a_right[4], a_right[11]);
+ b_right[12] = vsubq_s32(a_right[3], a_right[12]);
+ b_right[13] = vsubq_s32(a_right[2], a_right[13]);
+ b_right[14] = vsubq_s32(a_right[1], a_right[14]);
+ b_right[15] = vsubq_s32(a_right[0], a_right[15]);
+}
+
+static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/,
+ int32x4_t *right /* [16] */) {
+ const int32x4_t one = vdupq_n_s32(1);
+ left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2);
+ left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2);
+ left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2);
+ left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2);
+ left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2);
+ left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2);
+ left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2);
+ left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2);
+ left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2);
+ left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2);
+ left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2);
+ left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2);
+ left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2);
+ left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2);
+ left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2);
+ left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2);
+
+ right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2);
+ right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2);
+ right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2);
+ right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2);
+ right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2);
+ right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2);
+ right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2);
+ right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2);
+ right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2);
+ right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2);
+ right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2);
+ right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2);
+ right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2);
+ right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2);
+ right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2);
+ right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2);
+}
+
+// Store 16 32x4 vectors, assuming stride == 16.
+static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) {
+ vst1q_s32(a, b[0]);
+ a += 16;
+ vst1q_s32(a, b[1]);
+ a += 16;
+ vst1q_s32(a, b[2]);
+ a += 16;
+ vst1q_s32(a, b[3]);
+ a += 16;
+ vst1q_s32(a, b[4]);
+ a += 16;
+ vst1q_s32(a, b[5]);
+ a += 16;
+ vst1q_s32(a, b[6]);
+ a += 16;
+ vst1q_s32(a, b[7]);
+ a += 16;
+ vst1q_s32(a, b[8]);
+ a += 16;
+ vst1q_s32(a, b[9]);
+ a += 16;
+ vst1q_s32(a, b[10]);
+ a += 16;
+ vst1q_s32(a, b[11]);
+ a += 16;
+ vst1q_s32(a, b[12]);
+ a += 16;
+ vst1q_s32(a, b[13]);
+ a += 16;
+ vst1q_s32(a, b[14]);
+ a += 16;
+ vst1q_s32(a, b[15]);
+}
+
+// Main body of fdct8x16 column
+static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/,
+ int32x4_t *right /* [16] */) {
+ int32x4_t sl[8];
+ int32x4_t sr[8];
+ int32x4_t xl[4];
+ int32x4_t xr[4];
+ int32x4_t inl[8];
+ int32x4_t inr[8];
+ int32x4_t stepl[8];
+ int32x4_t stepr[8];
+
+ // stage 1
+ // From fwd_txfm.c: Work on the first eight values; fdct8(input,
+ // even_results);"
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // Copy values 8-15 as we're storing in-place
+ inl[0] = left[8];
+ inr[0] = right[8];
+ inl[1] = left[9];
+ inr[1] = right[9];
+ inl[2] = left[10];
+ inr[2] = right[10];
+ inl[3] = left[11];
+ inr[3] = right[11];
+ inl[4] = left[12];
+ inr[4] = right[12];
+ inl[5] = left[13];
+ inr[5] = right[13];
+ inl[6] = left[14];
+ inr[6] = right[14];
+ inl[7] = left[15];
+ inr[7] = right[15];
+
+ // fdct4(step, step);
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[8], &right[8]);
+
+ // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+ // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+ cospi_24_64, &left[4], &right[4],
+ &left[12], &right[12]);
+
+ // Stage 2
+ // Re-using source s5/s6
+ // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
+ // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6],
+ &sr[6], &sl[5], &sr[5]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], sl[5]);
+ xr[0] = vaddq_s32(sr[4], sr[5]);
+ xl[1] = vsubq_s32(sl[4], sl[5]);
+ xr[1] = vsubq_s32(sr[4], sr[5]);
+ xl[2] = vsubq_s32(sl[7], sl[6]);
+ xr[2] = vsubq_s32(sr[7], sr[6]);
+ xl[3] = vaddq_s32(sl[7], sl[6]);
+ xr[3] = vaddq_s32(sr[7], sr[6]);
+
+ // Stage 4
+ // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64)
+ // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+ cospi_28_64, &left[2], &right[2],
+ &left[14], &right[14]);
+ // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+ // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+ cospi_12_64, &left[10], &right[10],
+ &left[6], &right[6]);
+
+ // step 2
+ // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
+ // That file distinguished between "in_high" and "step1" but the only
+ // difference is that "in_high" is the first 8 values and "step 1" is the
+ // second. Here, since they are all in one array, "step1" values are += 8.
+
+ // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
+ // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
+ // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
+ // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64,
+ &sl[5], &sr[5], &sl[2], &sr[2]);
+ butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64,
+ &sl[4], &sr[4], &sl[3], &sr[3]);
+
+ // step 3
+ sl[0] = vaddq_s32(inl[0], sl[3]);
+ sr[0] = vaddq_s32(inr[0], sr[3]);
+ sl[1] = vaddq_s32(inl[1], sl[2]);
+ sr[1] = vaddq_s32(inr[1], sr[2]);
+ xl[0] = vsubq_s32(inl[1], sl[2]);
+ xr[0] = vsubq_s32(inr[1], sr[2]);
+ xl[1] = vsubq_s32(inl[0], sl[3]);
+ xr[1] = vsubq_s32(inr[0], sr[3]);
+ xl[2] = vsubq_s32(inl[7], sl[4]);
+ xr[2] = vsubq_s32(inr[7], sr[4]);
+ xl[3] = vsubq_s32(inl[6], sl[5]);
+ xr[3] = vsubq_s32(inr[6], sr[5]);
+ sl[6] = vaddq_s32(inl[6], sl[5]);
+ sr[6] = vaddq_s32(inr[6], sr[5]);
+ sl[7] = vaddq_s32(inl[7], sl[4]);
+ sr[7] = vaddq_s32(inr[7], sr[4]);
+
+ // step 4
+ // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] *
+ // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1]
+ // * cospi_8_64)
+ butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64,
+ cospi_24_64, &sl[6], &sr[6], &sl[1],
+ &sr[1]);
+ // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
+ // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] *
+ // cospi_24_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64,
+ cospi_8_64, &sl[2], &sr[2], &sl[5],
+ &sr[5]);
+
+ // step 5
+ stepl[0] = vaddq_s32(sl[0], sl[1]);
+ stepr[0] = vaddq_s32(sr[0], sr[1]);
+ stepl[1] = vsubq_s32(sl[0], sl[1]);
+ stepr[1] = vsubq_s32(sr[0], sr[1]);
+ stepl[2] = vaddq_s32(xl[1], sl[2]);
+ stepr[2] = vaddq_s32(xr[1], sr[2]);
+ stepl[3] = vsubq_s32(xl[1], sl[2]);
+ stepr[3] = vsubq_s32(xr[1], sr[2]);
+ stepl[4] = vsubq_s32(xl[2], sl[5]);
+ stepr[4] = vsubq_s32(xr[2], sr[5]);
+ stepl[5] = vaddq_s32(xl[2], sl[5]);
+ stepr[5] = vaddq_s32(xr[2], sr[5]);
+ stepl[6] = vsubq_s32(sl[7], sl[6]);
+ stepr[6] = vsubq_s32(sr[7], sr[6]);
+ stepl[7] = vaddq_s32(sl[7], sl[6]);
+ stepr[7] = vaddq_s32(sr[7], sr[6]);
+
+ // step 6
+ // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+ // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1],
+ cospi_18_64, cospi_14_64, &left[9],
+ &right[9], &left[7], &right[7]);
+ // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64)
+ // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0],
+ cospi_2_64, cospi_30_64, &left[1],
+ &right[1], &left[15], &right[15]);
+ // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+ // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3],
+ cospi_26_64, cospi_6_64, &left[13],
+ &right[13], &left[3], &right[3]);
+ // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+ // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2],
+ cospi_10_64, cospi_22_64, &left[5],
+ &right[5], &left[11], &right[11]);
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c
index de74e6630..d6818d2ec 100644
--- a/vpx_dsp/arm/fdct32x32_neon.c
+++ b/vpx_dsp/arm/fdct32x32_neon.c
@@ -15,6 +15,8 @@
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct32x32_neon.h"
// Most gcc 4.9 distributions outside of Android do not generate correct code
// for this function.
@@ -32,1289 +34,6 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
#else
-#define LOAD_INCREMENT(src, stride, dest, index) \
- do { \
- dest[index] = vld1q_s16(src); \
- src += stride; \
- } while (0)
-
-#define ADD_S16(src, index0, index1, dest, index3) \
- do { \
- dest[index3] = vaddq_s16(src[index0], src[index1]); \
- } while (0)
-
-#define ADD_SHIFT_S16(src, index0, index1) \
- do { \
- src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \
- } while (0)
-
-// Load, cross, and multiply by 4. Load the first 8 and last 8, then the
-// middle
-// 16. Doing sets of 16 at a time. Maybe sets of 8 would be better?
-static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
- const int16_t *a_end = a + 24 * stride;
- int16x8_t c[8];
-
- LOAD_INCREMENT(a, stride, b, 0);
- LOAD_INCREMENT(a, stride, b, 1);
- LOAD_INCREMENT(a, stride, b, 2);
- LOAD_INCREMENT(a, stride, b, 3);
- LOAD_INCREMENT(a, stride, b, 4);
- LOAD_INCREMENT(a, stride, b, 5);
- LOAD_INCREMENT(a, stride, b, 6);
- LOAD_INCREMENT(a, stride, b, 7);
-
- LOAD_INCREMENT(a_end, stride, b, 24);
- LOAD_INCREMENT(a_end, stride, b, 25);
- LOAD_INCREMENT(a_end, stride, b, 26);
- LOAD_INCREMENT(a_end, stride, b, 27);
- LOAD_INCREMENT(a_end, stride, b, 28);
- LOAD_INCREMENT(a_end, stride, b, 29);
- LOAD_INCREMENT(a_end, stride, b, 30);
- LOAD_INCREMENT(a_end, stride, b, 31);
-
- ADD_S16(b, 0, 31, c, 0);
- ADD_S16(b, 1, 30, c, 1);
- ADD_S16(b, 2, 29, c, 2);
- ADD_S16(b, 3, 28, c, 3);
- ADD_S16(b, 4, 27, c, 4);
- ADD_S16(b, 5, 26, c, 5);
- ADD_S16(b, 6, 25, c, 6);
- ADD_S16(b, 7, 24, c, 7);
-
- ADD_SHIFT_S16(b, 7, 24);
- ADD_SHIFT_S16(b, 6, 25);
- ADD_SHIFT_S16(b, 5, 26);
- ADD_SHIFT_S16(b, 4, 27);
- ADD_SHIFT_S16(b, 3, 28);
- ADD_SHIFT_S16(b, 2, 29);
- ADD_SHIFT_S16(b, 1, 30);
- ADD_SHIFT_S16(b, 0, 31);
-
- b[0] = vshlq_n_s16(c[0], 2);
- b[1] = vshlq_n_s16(c[1], 2);
- b[2] = vshlq_n_s16(c[2], 2);
- b[3] = vshlq_n_s16(c[3], 2);
- b[4] = vshlq_n_s16(c[4], 2);
- b[5] = vshlq_n_s16(c[5], 2);
- b[6] = vshlq_n_s16(c[6], 2);
- b[7] = vshlq_n_s16(c[7], 2);
-
- LOAD_INCREMENT(a, stride, b, 8);
- LOAD_INCREMENT(a, stride, b, 9);
- LOAD_INCREMENT(a, stride, b, 10);
- LOAD_INCREMENT(a, stride, b, 11);
- LOAD_INCREMENT(a, stride, b, 12);
- LOAD_INCREMENT(a, stride, b, 13);
- LOAD_INCREMENT(a, stride, b, 14);
- LOAD_INCREMENT(a, stride, b, 15);
- LOAD_INCREMENT(a, stride, b, 16);
- LOAD_INCREMENT(a, stride, b, 17);
- LOAD_INCREMENT(a, stride, b, 18);
- LOAD_INCREMENT(a, stride, b, 19);
- LOAD_INCREMENT(a, stride, b, 20);
- LOAD_INCREMENT(a, stride, b, 21);
- LOAD_INCREMENT(a, stride, b, 22);
- LOAD_INCREMENT(a, stride, b, 23);
-
- ADD_S16(b, 8, 23, c, 0);
- ADD_S16(b, 9, 22, c, 1);
- ADD_S16(b, 10, 21, c, 2);
- ADD_S16(b, 11, 20, c, 3);
- ADD_S16(b, 12, 19, c, 4);
- ADD_S16(b, 13, 18, c, 5);
- ADD_S16(b, 14, 17, c, 6);
- ADD_S16(b, 15, 16, c, 7);
-
- ADD_SHIFT_S16(b, 15, 16);
- ADD_SHIFT_S16(b, 14, 17);
- ADD_SHIFT_S16(b, 13, 18);
- ADD_SHIFT_S16(b, 12, 19);
- ADD_SHIFT_S16(b, 11, 20);
- ADD_SHIFT_S16(b, 10, 21);
- ADD_SHIFT_S16(b, 9, 22);
- ADD_SHIFT_S16(b, 8, 23);
-
- b[8] = vshlq_n_s16(c[0], 2);
- b[9] = vshlq_n_s16(c[1], 2);
- b[10] = vshlq_n_s16(c[2], 2);
- b[11] = vshlq_n_s16(c[3], 2);
- b[12] = vshlq_n_s16(c[4], 2);
- b[13] = vshlq_n_s16(c[5], 2);
- b[14] = vshlq_n_s16(c[6], 2);
- b[15] = vshlq_n_s16(c[7], 2);
-}
-
-#undef LOAD_INCREMENT
-#undef ADD_S16
-#undef ADD_SHIFT_S16
-
-#define STORE_S16(src, index, dest) \
- do { \
- store_s16q_to_tran_low(dest, src[index]); \
- dest += 8; \
- } while (0)
-
-// Store 32 16x8 values, assuming stride == 32.
-// Slight twist: store horizontally in blocks of 8.
-static INLINE void store(tran_low_t *a, const int16x8_t *b) {
- STORE_S16(b, 0, a);
- STORE_S16(b, 8, a);
- STORE_S16(b, 16, a);
- STORE_S16(b, 24, a);
- STORE_S16(b, 1, a);
- STORE_S16(b, 9, a);
- STORE_S16(b, 17, a);
- STORE_S16(b, 25, a);
- STORE_S16(b, 2, a);
- STORE_S16(b, 10, a);
- STORE_S16(b, 18, a);
- STORE_S16(b, 26, a);
- STORE_S16(b, 3, a);
- STORE_S16(b, 11, a);
- STORE_S16(b, 19, a);
- STORE_S16(b, 27, a);
- STORE_S16(b, 4, a);
- STORE_S16(b, 12, a);
- STORE_S16(b, 20, a);
- STORE_S16(b, 28, a);
- STORE_S16(b, 5, a);
- STORE_S16(b, 13, a);
- STORE_S16(b, 21, a);
- STORE_S16(b, 29, a);
- STORE_S16(b, 6, a);
- STORE_S16(b, 14, a);
- STORE_S16(b, 22, a);
- STORE_S16(b, 30, a);
- STORE_S16(b, 7, a);
- STORE_S16(b, 15, a);
- STORE_S16(b, 23, a);
- STORE_S16(b, 31, a);
-}
-
-#undef STORE_S16
-
-// fdct_round_shift((a +/- b) * c)
-static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
- const tran_high_t constant,
- int16x8_t *add, int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
- const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
- const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
- const tran_coef_t constant0,
- const tran_coef_t constant1,
- int16x8_t *add, int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
- const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1);
- const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1);
- const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0);
- const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// Add 2 if positive, 1 if negative, and shift by 2.
-// In practice, subtract the sign bit, then shift with rounding.
-static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
- const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
- const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
- const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
- return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
-}
-
-static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
- int16x8_t a[32];
- int16x8_t b[32];
-
- // Stage 1: Done as part of the load.
-
- // Stage 2.
- // Mini cross. X the first 16 values and the middle 8 of the second half.
- a[0] = vaddq_s16(in[0], in[15]);
- a[1] = vaddq_s16(in[1], in[14]);
- a[2] = vaddq_s16(in[2], in[13]);
- a[3] = vaddq_s16(in[3], in[12]);
- a[4] = vaddq_s16(in[4], in[11]);
- a[5] = vaddq_s16(in[5], in[10]);
- a[6] = vaddq_s16(in[6], in[9]);
- a[7] = vaddq_s16(in[7], in[8]);
-
- a[8] = vsubq_s16(in[7], in[8]);
- a[9] = vsubq_s16(in[6], in[9]);
- a[10] = vsubq_s16(in[5], in[10]);
- a[11] = vsubq_s16(in[4], in[11]);
- a[12] = vsubq_s16(in[3], in[12]);
- a[13] = vsubq_s16(in[2], in[13]);
- a[14] = vsubq_s16(in[1], in[14]);
- a[15] = vsubq_s16(in[0], in[15]);
-
- a[16] = in[16];
- a[17] = in[17];
- a[18] = in[18];
- a[19] = in[19];
-
- butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]);
- butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]);
- butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]);
- butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]);
-
- a[28] = in[28];
- a[29] = in[29];
- a[30] = in[30];
- a[31] = in[31];
-
- // Stage 3.
- b[0] = vaddq_s16(a[0], a[7]);
- b[1] = vaddq_s16(a[1], a[6]);
- b[2] = vaddq_s16(a[2], a[5]);
- b[3] = vaddq_s16(a[3], a[4]);
-
- b[4] = vsubq_s16(a[3], a[4]);
- b[5] = vsubq_s16(a[2], a[5]);
- b[6] = vsubq_s16(a[1], a[6]);
- b[7] = vsubq_s16(a[0], a[7]);
-
- b[8] = a[8];
- b[9] = a[9];
-
- butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]);
- butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]);
-
- b[14] = a[14];
- b[15] = a[15];
-
- b[16] = vaddq_s16(in[16], a[23]);
- b[17] = vaddq_s16(in[17], a[22]);
- b[18] = vaddq_s16(in[18], a[21]);
- b[19] = vaddq_s16(in[19], a[20]);
-
- b[20] = vsubq_s16(in[19], a[20]);
- b[21] = vsubq_s16(in[18], a[21]);
- b[22] = vsubq_s16(in[17], a[22]);
- b[23] = vsubq_s16(in[16], a[23]);
-
- b[24] = vsubq_s16(in[31], a[24]);
- b[25] = vsubq_s16(in[30], a[25]);
- b[26] = vsubq_s16(in[29], a[26]);
- b[27] = vsubq_s16(in[28], a[27]);
-
- b[28] = vaddq_s16(in[28], a[27]);
- b[29] = vaddq_s16(in[29], a[26]);
- b[30] = vaddq_s16(in[30], a[25]);
- b[31] = vaddq_s16(in[31], a[24]);
-
- // Stage 4.
- a[0] = vaddq_s16(b[0], b[3]);
- a[1] = vaddq_s16(b[1], b[2]);
- a[2] = vsubq_s16(b[1], b[2]);
- a[3] = vsubq_s16(b[0], b[3]);
-
- a[4] = b[4];
-
- butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]);
-
- a[7] = b[7];
-
- a[8] = vaddq_s16(b[8], b[11]);
- a[9] = vaddq_s16(b[9], b[10]);
- a[10] = vsubq_s16(b[9], b[10]);
- a[11] = vsubq_s16(b[8], b[11]);
- a[12] = vsubq_s16(b[15], b[12]);
- a[13] = vsubq_s16(b[14], b[13]);
- a[14] = vaddq_s16(b[14], b[13]);
- a[15] = vaddq_s16(b[15], b[12]);
-
- a[16] = b[16];
- a[17] = b[17];
-
- butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]);
- butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]);
- butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]);
- butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]);
-
- a[22] = b[22];
- a[23] = b[23];
- a[24] = b[24];
- a[25] = b[25];
-
- a[30] = b[30];
- a[31] = b[31];
-
- // Stage 5.
- butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]);
- butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]);
-
- b[4] = vaddq_s16(a[4], a[5]);
- b[5] = vsubq_s16(a[4], a[5]);
- b[6] = vsubq_s16(a[7], a[6]);
- b[7] = vaddq_s16(a[7], a[6]);
-
- b[8] = a[8];
-
- butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]);
- butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]);
-
- b[11] = a[11];
- b[12] = a[12];
-
- b[15] = a[15];
-
- b[16] = vaddq_s16(a[19], a[16]);
- b[17] = vaddq_s16(a[18], a[17]);
- b[18] = vsubq_s16(a[17], a[18]);
- b[19] = vsubq_s16(a[16], a[19]);
- b[20] = vsubq_s16(a[23], a[20]);
- b[21] = vsubq_s16(a[22], a[21]);
- b[22] = vaddq_s16(a[21], a[22]);
- b[23] = vaddq_s16(a[20], a[23]);
- b[24] = vaddq_s16(a[27], a[24]);
- b[25] = vaddq_s16(a[26], a[25]);
- b[26] = vsubq_s16(a[25], a[26]);
- b[27] = vsubq_s16(a[24], a[27]);
- b[28] = vsubq_s16(a[31], a[28]);
- b[29] = vsubq_s16(a[30], a[29]);
- b[30] = vaddq_s16(a[29], a[30]);
- b[31] = vaddq_s16(a[28], a[31]);
-
- // Stage 6.
- a[0] = b[0];
- a[1] = b[1];
- a[2] = b[2];
- a[3] = b[3];
-
- butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]);
- butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]);
-
- a[8] = vaddq_s16(b[8], b[9]);
- a[9] = vsubq_s16(b[8], b[9]);
- a[10] = vsubq_s16(b[11], b[10]);
- a[11] = vaddq_s16(b[11], b[10]);
- a[12] = vaddq_s16(b[12], b[13]);
- a[13] = vsubq_s16(b[12], b[13]);
- a[14] = vsubq_s16(b[15], b[14]);
- a[15] = vaddq_s16(b[15], b[14]);
-
- a[16] = b[16];
- a[19] = b[19];
- a[20] = b[20];
- a[23] = b[23];
- a[24] = b[24];
- a[27] = b[27];
- a[28] = b[28];
- a[31] = b[31];
-
- butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]);
- butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]);
-
- butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]);
- butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]);
-
- // Stage 7.
- b[0] = a[0];
- b[1] = a[1];
- b[2] = a[2];
- b[3] = a[3];
- b[4] = a[4];
- b[5] = a[5];
- b[6] = a[6];
- b[7] = a[7];
-
- butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]);
- butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]);
- butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]);
- butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]);
-
- b[16] = vaddq_s16(a[16], a[17]);
- b[17] = vsubq_s16(a[16], a[17]);
- b[18] = vsubq_s16(a[19], a[18]);
- b[19] = vaddq_s16(a[19], a[18]);
- b[20] = vaddq_s16(a[20], a[21]);
- b[21] = vsubq_s16(a[20], a[21]);
- b[22] = vsubq_s16(a[23], a[22]);
- b[23] = vaddq_s16(a[23], a[22]);
- b[24] = vaddq_s16(a[24], a[25]);
- b[25] = vsubq_s16(a[24], a[25]);
- b[26] = vsubq_s16(a[27], a[26]);
- b[27] = vaddq_s16(a[27], a[26]);
- b[28] = vaddq_s16(a[28], a[29]);
- b[29] = vsubq_s16(a[28], a[29]);
- b[30] = vsubq_s16(a[31], a[30]);
- b[31] = vaddq_s16(a[31], a[30]);
-
- // Final stage.
- // Also compute partial rounding shift:
- // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
- out[0] = sub_round_shift(b[0]);
- out[16] = sub_round_shift(b[1]);
- out[8] = sub_round_shift(b[2]);
- out[24] = sub_round_shift(b[3]);
- out[4] = sub_round_shift(b[4]);
- out[20] = sub_round_shift(b[5]);
- out[12] = sub_round_shift(b[6]);
- out[28] = sub_round_shift(b[7]);
- out[2] = sub_round_shift(b[8]);
- out[18] = sub_round_shift(b[9]);
- out[10] = sub_round_shift(b[10]);
- out[26] = sub_round_shift(b[11]);
- out[6] = sub_round_shift(b[12]);
- out[22] = sub_round_shift(b[13]);
- out[14] = sub_round_shift(b[14]);
- out[30] = sub_round_shift(b[15]);
-
- butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]);
- out[1] = sub_round_shift(a[1]);
- out[31] = sub_round_shift(a[31]);
-
- butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]);
- out[17] = sub_round_shift(a[17]);
- out[15] = sub_round_shift(a[15]);
-
- butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]);
- out[9] = sub_round_shift(a[9]);
- out[23] = sub_round_shift(a[23]);
-
- butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]);
- out[25] = sub_round_shift(a[25]);
- out[7] = sub_round_shift(a[7]);
-
- butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]);
- out[5] = sub_round_shift(a[5]);
- out[27] = sub_round_shift(a[27]);
-
- butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]);
- out[21] = sub_round_shift(a[21]);
- out[11] = sub_round_shift(a[11]);
-
- butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]);
- out[13] = sub_round_shift(a[13]);
- out[19] = sub_round_shift(a[19]);
-
- butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]);
- out[29] = sub_round_shift(a[29]);
- out[3] = sub_round_shift(a[3]);
-}
-
-#define PASS_THROUGH(src, dst, element) \
- do { \
- dst##_lo[element] = src##_lo[element]; \
- dst##_hi[element] = src##_hi[element]; \
- } while (0)
-
-#define ADD_S16_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = \
- vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
- b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \
- vget_high_s16(a[right_index])); \
- } while (0)
-
-#define SUB_S16_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = \
- vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
- b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \
- vget_high_s16(a[right_index])); \
- } while (0)
-
-#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \
- do { \
- c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \
- c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
- } while (0)
-
-#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
- do { \
- temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \
- temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \
- c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \
- c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \
- } while (0)
-
-#define ADD_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
- b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
- } while (0)
-
-#define SUB_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
- b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
- } while (0)
-
-// Like butterfly_one_coeff, but don't narrow results.
-static INLINE void butterfly_one_coeff_s16_s32(
- const int16x8_t a, const int16x8_t b, const tran_high_t constant,
- int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
- int32x4_t *sub_hi) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
- const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
- const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
- *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
- *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
- *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
- *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
-}
-
-#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \
- add_index, sub_index) \
- do { \
- butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
- &b##_lo[add_index], &b##_hi[add_index], \
- &b##_lo[sub_index], &b##_hi[sub_index]); \
- } while (0)
-
-// Like butterfly_one_coeff, but with s32.
-static INLINE void butterfly_one_coeff_s32(
- const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
- const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo,
- int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
- const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant);
- const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant);
- const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant);
- const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant);
- const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant);
- const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant);
- *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
- *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
- *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
- *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
-}
-
-#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
- sub_index) \
- do { \
- butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
- a##_lo[right_index], a##_hi[right_index], \
- constant, &b##_lo[add_index], &b##_hi[add_index], \
- &b##_lo[sub_index], &b##_hi[sub_index]); \
- } while (0)
-
-// Like butterfly_two_coeff, but with s32.
-static INLINE void butterfly_two_coeff_s32(
- const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
- const int32x4_t b_hi, const int32_t constant0, const int32_t constant1,
- int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
- int32x4_t *sub_hi) {
- const int32x4_t a0 = vmulq_n_s32(a_lo, constant0);
- const int32x4_t a1 = vmulq_n_s32(a_hi, constant0);
- const int32x4_t a2 = vmulq_n_s32(a_lo, constant1);
- const int32x4_t a3 = vmulq_n_s32(a_hi, constant1);
- const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0);
- const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0);
- const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1);
- const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1);
- *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
- *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
- *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
- *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
-}
-
-#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \
- right_constant, b, add_index, sub_index) \
- do { \
- butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
- a##_lo[right_index], a##_hi[right_index], \
- left_constant, right_constant, &b##_lo[add_index], \
- &b##_hi[add_index], &b##_lo[sub_index], \
- &b##_hi[sub_index]); \
- } while (0)
-
-// Add 1 if positive, 2 if negative, and shift by 2.
-// In practice, add 1, then add the sign bit, then shift without rounding.
-static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
- const int32x4_t a_hi) {
- const int32x4_t one = vdupq_n_s32(1);
- const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
- const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
- const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
- const int16x4_t b_lo =
- vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
- const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
- const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
- const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
- const int16x4_t b_hi =
- vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
- return vcombine_s16(b_lo, b_hi);
-}
-
-static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
- int16x8_t a[32];
- int16x8_t b[32];
- int32x4_t c_lo[32];
- int32x4_t c_hi[32];
- int32x4_t d_lo[32];
- int32x4_t d_hi[32];
-
- // Stage 1. Done as part of the load for the first pass.
- a[0] = vaddq_s16(in[0], in[31]);
- a[1] = vaddq_s16(in[1], in[30]);
- a[2] = vaddq_s16(in[2], in[29]);
- a[3] = vaddq_s16(in[3], in[28]);
- a[4] = vaddq_s16(in[4], in[27]);
- a[5] = vaddq_s16(in[5], in[26]);
- a[6] = vaddq_s16(in[6], in[25]);
- a[7] = vaddq_s16(in[7], in[24]);
- a[8] = vaddq_s16(in[8], in[23]);
- a[9] = vaddq_s16(in[9], in[22]);
- a[10] = vaddq_s16(in[10], in[21]);
- a[11] = vaddq_s16(in[11], in[20]);
- a[12] = vaddq_s16(in[12], in[19]);
- a[13] = vaddq_s16(in[13], in[18]);
- a[14] = vaddq_s16(in[14], in[17]);
- a[15] = vaddq_s16(in[15], in[16]);
- a[16] = vsubq_s16(in[15], in[16]);
- a[17] = vsubq_s16(in[14], in[17]);
- a[18] = vsubq_s16(in[13], in[18]);
- a[19] = vsubq_s16(in[12], in[19]);
- a[20] = vsubq_s16(in[11], in[20]);
- a[21] = vsubq_s16(in[10], in[21]);
- a[22] = vsubq_s16(in[9], in[22]);
- a[23] = vsubq_s16(in[8], in[23]);
- a[24] = vsubq_s16(in[7], in[24]);
- a[25] = vsubq_s16(in[6], in[25]);
- a[26] = vsubq_s16(in[5], in[26]);
- a[27] = vsubq_s16(in[4], in[27]);
- a[28] = vsubq_s16(in[3], in[28]);
- a[29] = vsubq_s16(in[2], in[29]);
- a[30] = vsubq_s16(in[1], in[30]);
- a[31] = vsubq_s16(in[0], in[31]);
-
- // Stage 2.
- b[0] = vaddq_s16(a[0], a[15]);
- b[1] = vaddq_s16(a[1], a[14]);
- b[2] = vaddq_s16(a[2], a[13]);
- b[3] = vaddq_s16(a[3], a[12]);
- b[4] = vaddq_s16(a[4], a[11]);
- b[5] = vaddq_s16(a[5], a[10]);
- b[6] = vaddq_s16(a[6], a[9]);
- b[7] = vaddq_s16(a[7], a[8]);
-
- b[8] = vsubq_s16(a[7], a[8]);
- b[9] = vsubq_s16(a[6], a[9]);
- b[10] = vsubq_s16(a[5], a[10]);
- b[11] = vsubq_s16(a[4], a[11]);
- b[12] = vsubq_s16(a[3], a[12]);
- b[13] = vsubq_s16(a[2], a[13]);
- b[14] = vsubq_s16(a[1], a[14]);
- b[15] = vsubq_s16(a[0], a[15]);
-
- b[16] = a[16];
- b[17] = a[17];
- b[18] = a[18];
- b[19] = a[19];
-
- butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
- butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
- butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
- butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
-
- b[28] = a[28];
- b[29] = a[29];
- b[30] = a[30];
- b[31] = a[31];
-
- // Stage 3. With extreme values for input this calculation rolls over int16_t.
- // The sources for b[0] get added multiple times and, through testing, have
- // been shown to overflow starting here.
- ADD_S16_S32(b, 0, 7, c, 0);
- ADD_S16_S32(b, 1, 6, c, 1);
- ADD_S16_S32(b, 2, 5, c, 2);
- ADD_S16_S32(b, 3, 4, c, 3);
- SUB_S16_S32(b, 3, 4, c, 4);
- SUB_S16_S32(b, 2, 5, c, 5);
- SUB_S16_S32(b, 1, 6, c, 6);
- SUB_S16_S32(b, 0, 7, c, 7);
-
- a[8] = b[8];
- a[9] = b[9];
-
- BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
- BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
-
- a[14] = b[14];
- a[15] = b[15];
-
- ADD_S16_S32(b, 16, 23, c, 16);
- ADD_S16_S32(b, 17, 22, c, 17);
- ADD_S16_S32(b, 18, 21, c, 18);
- ADD_S16_S32(b, 19, 20, c, 19);
- SUB_S16_S32(b, 19, 20, c, 20);
- SUB_S16_S32(b, 18, 21, c, 21);
- SUB_S16_S32(b, 17, 22, c, 22);
- SUB_S16_S32(b, 16, 23, c, 23);
- SUB_S16_S32(b, 31, 24, c, 24);
- SUB_S16_S32(b, 30, 25, c, 25);
- SUB_S16_S32(b, 29, 26, c, 26);
- SUB_S16_S32(b, 28, 27, c, 27);
- ADD_S16_S32(b, 28, 27, c, 28);
- ADD_S16_S32(b, 29, 26, c, 29);
- ADD_S16_S32(b, 30, 25, c, 30);
- ADD_S16_S32(b, 31, 24, c, 31);
-
- // Stage 4.
- ADD_S32(c, 0, 3, d, 0);
- ADD_S32(c, 1, 2, d, 1);
- SUB_S32(c, 1, 2, d, 2);
- SUB_S32(c, 0, 3, d, 3);
-
- PASS_THROUGH(c, d, 4);
-
- BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
-
- PASS_THROUGH(c, d, 7);
-
- ADDW_S16_S32(c, 11, a, 8, d, 8);
- ADDW_S16_S32(c, 10, a, 9, d, 9);
- SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
- SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
- SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
- SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
- ADDW_S16_S32(c, 13, b, 14, d, 14);
- ADDW_S16_S32(c, 12, b, 15, d, 15);
-
- PASS_THROUGH(c, d, 16);
- PASS_THROUGH(c, d, 17);
-
- BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18);
- BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19);
- BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20);
- BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21);
-
- PASS_THROUGH(c, d, 22);
- PASS_THROUGH(c, d, 23);
- PASS_THROUGH(c, d, 24);
- PASS_THROUGH(c, d, 25);
-
- PASS_THROUGH(c, d, 30);
- PASS_THROUGH(c, d, 31);
-
- // Stage 5.
- BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
- BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3);
-
- ADD_S32(d, 4, 5, c, 4);
- SUB_S32(d, 4, 5, c, 5);
- SUB_S32(d, 7, 6, c, 6);
- ADD_S32(d, 7, 6, c, 7);
-
- PASS_THROUGH(d, c, 8);
-
- BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9);
- BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10);
-
- PASS_THROUGH(d, c, 11);
- PASS_THROUGH(d, c, 12);
- PASS_THROUGH(d, c, 15);
-
- ADD_S32(d, 16, 19, c, 16);
- ADD_S32(d, 17, 18, c, 17);
- SUB_S32(d, 17, 18, c, 18);
- SUB_S32(d, 16, 19, c, 19);
- SUB_S32(d, 23, 20, c, 20);
- SUB_S32(d, 22, 21, c, 21);
- ADD_S32(d, 22, 21, c, 22);
- ADD_S32(d, 23, 20, c, 23);
- ADD_S32(d, 24, 27, c, 24);
- ADD_S32(d, 25, 26, c, 25);
- SUB_S32(d, 25, 26, c, 26);
- SUB_S32(d, 24, 27, c, 27);
- SUB_S32(d, 31, 28, c, 28);
- SUB_S32(d, 30, 29, c, 29);
- ADD_S32(d, 30, 29, c, 30);
- ADD_S32(d, 31, 28, c, 31);
-
- // Stage 6.
- PASS_THROUGH(c, d, 0);
- PASS_THROUGH(c, d, 1);
- PASS_THROUGH(c, d, 2);
- PASS_THROUGH(c, d, 3);
-
- BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7);
- BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6);
-
- ADD_S32(c, 8, 9, d, 8);
- SUB_S32(c, 8, 9, d, 9);
- SUB_S32(c, 11, 10, d, 10);
- ADD_S32(c, 11, 10, d, 11);
- ADD_S32(c, 12, 13, d, 12);
- SUB_S32(c, 12, 13, d, 13);
- SUB_S32(c, 15, 14, d, 14);
- ADD_S32(c, 15, 14, d, 15);
-
- PASS_THROUGH(c, d, 16);
- PASS_THROUGH(c, d, 19);
- PASS_THROUGH(c, d, 20);
- PASS_THROUGH(c, d, 23);
- PASS_THROUGH(c, d, 24);
- PASS_THROUGH(c, d, 27);
- PASS_THROUGH(c, d, 28);
- PASS_THROUGH(c, d, 31);
-
- BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17);
- BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18);
- BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21);
- BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22);
-
- // Stage 7.
- PASS_THROUGH(d, c, 0);
- PASS_THROUGH(d, c, 1);
- PASS_THROUGH(d, c, 2);
- PASS_THROUGH(d, c, 3);
- PASS_THROUGH(d, c, 4);
- PASS_THROUGH(d, c, 5);
- PASS_THROUGH(d, c, 6);
- PASS_THROUGH(d, c, 7);
-
- BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15);
- BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14);
- BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13);
- BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12);
-
- ADD_S32(d, 16, 17, c, 16);
- SUB_S32(d, 16, 17, c, 17);
- SUB_S32(d, 19, 18, c, 18);
- ADD_S32(d, 19, 18, c, 19);
- ADD_S32(d, 20, 21, c, 20);
- SUB_S32(d, 20, 21, c, 21);
- SUB_S32(d, 23, 22, c, 22);
- ADD_S32(d, 23, 22, c, 23);
- ADD_S32(d, 24, 25, c, 24);
- SUB_S32(d, 24, 25, c, 25);
- SUB_S32(d, 27, 26, c, 26);
- ADD_S32(d, 27, 26, c, 27);
- ADD_S32(d, 28, 29, c, 28);
- SUB_S32(d, 28, 29, c, 29);
- SUB_S32(d, 31, 30, c, 30);
- ADD_S32(d, 31, 30, c, 31);
-
- // Final stage.
- // Roll rounding into this function so we can pass back int16x8.
-
- out[0] = add_round_shift_s32(c_lo[0], c_hi[0]);
- out[16] = add_round_shift_s32(c_lo[1], c_hi[1]);
-
- out[8] = add_round_shift_s32(c_lo[2], c_hi[2]);
- out[24] = add_round_shift_s32(c_lo[3], c_hi[3]);
- out[4] = add_round_shift_s32(c_lo[4], c_hi[4]);
- out[20] = add_round_shift_s32(c_lo[5], c_hi[5]);
- out[12] = add_round_shift_s32(c_lo[6], c_hi[6]);
-
- out[28] = add_round_shift_s32(c_lo[7], c_hi[7]);
- out[2] = add_round_shift_s32(c_lo[8], c_hi[8]);
- out[18] = add_round_shift_s32(c_lo[9], c_hi[9]);
- out[10] = add_round_shift_s32(c_lo[10], c_hi[10]);
-
- out[26] = add_round_shift_s32(c_lo[11], c_hi[11]);
- out[6] = add_round_shift_s32(c_lo[12], c_hi[12]);
- out[22] = add_round_shift_s32(c_lo[13], c_hi[13]);
- out[14] = add_round_shift_s32(c_lo[14], c_hi[14]);
- out[30] = add_round_shift_s32(c_lo[15], c_hi[15]);
-
- BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31);
- out[1] = add_round_shift_s32(d_lo[1], d_hi[1]);
- out[31] = add_round_shift_s32(d_lo[31], d_hi[31]);
-
- BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15);
- out[17] = add_round_shift_s32(d_lo[17], d_hi[17]);
- out[15] = add_round_shift_s32(d_lo[15], d_hi[15]);
-
- BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23);
- out[9] = add_round_shift_s32(d_lo[9], d_hi[9]);
- out[23] = add_round_shift_s32(d_lo[23], d_hi[23]);
-
- BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7);
- out[25] = add_round_shift_s32(d_lo[25], d_hi[25]);
- out[7] = add_round_shift_s32(d_lo[7], d_hi[7]);
-
- BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27);
- out[5] = add_round_shift_s32(d_lo[5], d_hi[5]);
- out[27] = add_round_shift_s32(d_lo[27], d_hi[27]);
-
- BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11);
- out[21] = add_round_shift_s32(d_lo[21], d_hi[21]);
- out[11] = add_round_shift_s32(d_lo[11], d_hi[11]);
-
- BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19);
- out[13] = add_round_shift_s32(d_lo[13], d_hi[13]);
- out[19] = add_round_shift_s32(d_lo[19], d_hi[19]);
-
- BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3);
- out[29] = add_round_shift_s32(d_lo[29], d_hi[29]);
- out[3] = add_round_shift_s32(d_lo[3], d_hi[3]);
-}
-
-// Add 1 if positive, 2 if negative, and shift by 2.
-// In practice, add 1, then add the sign bit, then shift without rounding.
-static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
- const int16x8_t one = vdupq_n_s16(1);
- const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
- const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
- const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
- return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
-}
-
-static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) {
- int16x8_t a[32];
- int16x8_t b[32];
-
- // Stage 1. Done as part of the load for the first pass.
- a[0] = vaddq_s16(in[0], in[31]);
- a[1] = vaddq_s16(in[1], in[30]);
- a[2] = vaddq_s16(in[2], in[29]);
- a[3] = vaddq_s16(in[3], in[28]);
- a[4] = vaddq_s16(in[4], in[27]);
- a[5] = vaddq_s16(in[5], in[26]);
- a[6] = vaddq_s16(in[6], in[25]);
- a[7] = vaddq_s16(in[7], in[24]);
- a[8] = vaddq_s16(in[8], in[23]);
- a[9] = vaddq_s16(in[9], in[22]);
- a[10] = vaddq_s16(in[10], in[21]);
- a[11] = vaddq_s16(in[11], in[20]);
- a[12] = vaddq_s16(in[12], in[19]);
- a[13] = vaddq_s16(in[13], in[18]);
- a[14] = vaddq_s16(in[14], in[17]);
- a[15] = vaddq_s16(in[15], in[16]);
- a[16] = vsubq_s16(in[15], in[16]);
- a[17] = vsubq_s16(in[14], in[17]);
- a[18] = vsubq_s16(in[13], in[18]);
- a[19] = vsubq_s16(in[12], in[19]);
- a[20] = vsubq_s16(in[11], in[20]);
- a[21] = vsubq_s16(in[10], in[21]);
- a[22] = vsubq_s16(in[9], in[22]);
- a[23] = vsubq_s16(in[8], in[23]);
- a[24] = vsubq_s16(in[7], in[24]);
- a[25] = vsubq_s16(in[6], in[25]);
- a[26] = vsubq_s16(in[5], in[26]);
- a[27] = vsubq_s16(in[4], in[27]);
- a[28] = vsubq_s16(in[3], in[28]);
- a[29] = vsubq_s16(in[2], in[29]);
- a[30] = vsubq_s16(in[1], in[30]);
- a[31] = vsubq_s16(in[0], in[31]);
-
- // Stage 2.
- // For the "rd" version, all the values are rounded down after stage 2 to keep
- // the values in 16 bits.
- b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
- b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
- b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
- b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
- b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
- b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
- b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
- b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
-
- b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
- b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
- b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
- b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
- b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
- b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
- b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
- b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
-
- b[16] = add_round_shift_s16(a[16]);
- b[17] = add_round_shift_s16(a[17]);
- b[18] = add_round_shift_s16(a[18]);
- b[19] = add_round_shift_s16(a[19]);
-
- butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
- butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
- butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
- butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
- b[20] = add_round_shift_s16(b[20]);
- b[21] = add_round_shift_s16(b[21]);
- b[22] = add_round_shift_s16(b[22]);
- b[23] = add_round_shift_s16(b[23]);
- b[24] = add_round_shift_s16(b[24]);
- b[25] = add_round_shift_s16(b[25]);
- b[26] = add_round_shift_s16(b[26]);
- b[27] = add_round_shift_s16(b[27]);
-
- b[28] = add_round_shift_s16(a[28]);
- b[29] = add_round_shift_s16(a[29]);
- b[30] = add_round_shift_s16(a[30]);
- b[31] = add_round_shift_s16(a[31]);
-
- // Stage 3.
- a[0] = vaddq_s16(b[0], b[7]);
- a[1] = vaddq_s16(b[1], b[6]);
- a[2] = vaddq_s16(b[2], b[5]);
- a[3] = vaddq_s16(b[3], b[4]);
-
- a[4] = vsubq_s16(b[3], b[4]);
- a[5] = vsubq_s16(b[2], b[5]);
- a[6] = vsubq_s16(b[1], b[6]);
- a[7] = vsubq_s16(b[0], b[7]);
-
- a[8] = b[8];
- a[9] = b[9];
-
- butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]);
- butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]);
-
- a[14] = b[14];
- a[15] = b[15];
-
- a[16] = vaddq_s16(b[16], b[23]);
- a[17] = vaddq_s16(b[17], b[22]);
- a[18] = vaddq_s16(b[18], b[21]);
- a[19] = vaddq_s16(b[19], b[20]);
-
- a[20] = vsubq_s16(b[19], b[20]);
- a[21] = vsubq_s16(b[18], b[21]);
- a[22] = vsubq_s16(b[17], b[22]);
- a[23] = vsubq_s16(b[16], b[23]);
-
- a[24] = vsubq_s16(b[31], b[24]);
- a[25] = vsubq_s16(b[30], b[25]);
- a[26] = vsubq_s16(b[29], b[26]);
- a[27] = vsubq_s16(b[28], b[27]);
-
- a[28] = vaddq_s16(b[28], b[27]);
- a[29] = vaddq_s16(b[29], b[26]);
- a[30] = vaddq_s16(b[30], b[25]);
- a[31] = vaddq_s16(b[31], b[24]);
-
- // Stage 4.
- b[0] = vaddq_s16(a[0], a[3]);
- b[1] = vaddq_s16(a[1], a[2]);
- b[2] = vsubq_s16(a[1], a[2]);
- b[3] = vsubq_s16(a[0], a[3]);
-
- b[4] = a[4];
-
- butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]);
-
- b[7] = a[7];
-
- b[8] = vaddq_s16(a[8], a[11]);
- b[9] = vaddq_s16(a[9], a[10]);
- b[10] = vsubq_s16(a[9], a[10]);
- b[11] = vsubq_s16(a[8], a[11]);
- b[12] = vsubq_s16(a[15], a[12]);
- b[13] = vsubq_s16(a[14], a[13]);
- b[14] = vaddq_s16(a[14], a[13]);
- b[15] = vaddq_s16(a[15], a[12]);
-
- b[16] = a[16];
- b[17] = a[17];
-
- butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]);
- butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]);
- butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]);
- butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]);
-
- b[22] = a[22];
- b[23] = a[23];
- b[24] = a[24];
- b[25] = a[25];
-
- b[30] = a[30];
- b[31] = a[31];
-
- // Stage 5.
- butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]);
- butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]);
-
- a[4] = vaddq_s16(b[4], b[5]);
- a[5] = vsubq_s16(b[4], b[5]);
- a[6] = vsubq_s16(b[7], b[6]);
- a[7] = vaddq_s16(b[7], b[6]);
-
- a[8] = b[8];
-
- butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]);
- butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]);
-
- a[11] = b[11];
- a[12] = b[12];
-
- a[15] = b[15];
-
- a[16] = vaddq_s16(b[19], b[16]);
- a[17] = vaddq_s16(b[18], b[17]);
- a[18] = vsubq_s16(b[17], b[18]);
- a[19] = vsubq_s16(b[16], b[19]);
- a[20] = vsubq_s16(b[23], b[20]);
- a[21] = vsubq_s16(b[22], b[21]);
- a[22] = vaddq_s16(b[21], b[22]);
- a[23] = vaddq_s16(b[20], b[23]);
- a[24] = vaddq_s16(b[27], b[24]);
- a[25] = vaddq_s16(b[26], b[25]);
- a[26] = vsubq_s16(b[25], b[26]);
- a[27] = vsubq_s16(b[24], b[27]);
- a[28] = vsubq_s16(b[31], b[28]);
- a[29] = vsubq_s16(b[30], b[29]);
- a[30] = vaddq_s16(b[29], b[30]);
- a[31] = vaddq_s16(b[28], b[31]);
-
- // Stage 6.
- b[0] = a[0];
- b[1] = a[1];
- b[2] = a[2];
- b[3] = a[3];
-
- butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]);
- butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]);
-
- b[8] = vaddq_s16(a[8], a[9]);
- b[9] = vsubq_s16(a[8], a[9]);
- b[10] = vsubq_s16(a[11], a[10]);
- b[11] = vaddq_s16(a[11], a[10]);
- b[12] = vaddq_s16(a[12], a[13]);
- b[13] = vsubq_s16(a[12], a[13]);
- b[14] = vsubq_s16(a[15], a[14]);
- b[15] = vaddq_s16(a[15], a[14]);
-
- b[16] = a[16];
- b[19] = a[19];
- b[20] = a[20];
- b[23] = a[23];
- b[24] = a[24];
- b[27] = a[27];
- b[28] = a[28];
- b[31] = a[31];
-
- butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]);
- butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]);
-
- butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]);
- butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]);
-
- // Stage 7.
- a[0] = b[0];
- a[1] = b[1];
- a[2] = b[2];
- a[3] = b[3];
- a[4] = b[4];
- a[5] = b[5];
- a[6] = b[6];
- a[7] = b[7];
-
- butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]);
- butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]);
- butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]);
- butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]);
-
- a[16] = vaddq_s16(b[16], b[17]);
- a[17] = vsubq_s16(b[16], b[17]);
- a[18] = vsubq_s16(b[19], b[18]);
- a[19] = vaddq_s16(b[19], b[18]);
- a[20] = vaddq_s16(b[20], b[21]);
- a[21] = vsubq_s16(b[20], b[21]);
- a[22] = vsubq_s16(b[23], b[22]);
- a[23] = vaddq_s16(b[23], b[22]);
- a[24] = vaddq_s16(b[24], b[25]);
- a[25] = vsubq_s16(b[24], b[25]);
- a[26] = vsubq_s16(b[27], b[26]);
- a[27] = vaddq_s16(b[27], b[26]);
- a[28] = vaddq_s16(b[28], b[29]);
- a[29] = vsubq_s16(b[28], b[29]);
- a[30] = vsubq_s16(b[31], b[30]);
- a[31] = vaddq_s16(b[31], b[30]);
-
- // Final stage.
- out[0] = a[0];
- out[16] = a[1];
- out[8] = a[2];
- out[24] = a[3];
- out[4] = a[4];
- out[20] = a[5];
- out[12] = a[6];
- out[28] = a[7];
- out[2] = a[8];
- out[18] = a[9];
- out[10] = a[10];
- out[26] = a[11];
- out[6] = a[12];
- out[22] = a[13];
- out[14] = a[14];
- out[30] = a[15];
-
- butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]);
- butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17],
- &out[15]);
- butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]);
- butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]);
- butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]);
- butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21],
- &out[11]);
- butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13],
- &out[19]);
- butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]);
-}
-
-#undef PASS_THROUGH
-#undef ADD_S16_S32
-#undef SUB_S16_S32
-#undef ADDW_S16_S32
-#undef SUBW_S16_S32
-#undef ADD_S32
-#undef SUB_S32
-#undef BUTTERFLY_ONE_S16_S32
-#undef BUTTERFLY_ONE_S32
-#undef BUTTERFLY_TWO_S32
-
-// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
-// are all in-place.
-// TODO(johannkoenig): share with other fdcts.
-static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) {
- // Swap 16 bit elements.
- const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
- const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
- const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
- const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
-
- // Swap 32 bit elements.
- const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
- vreinterpretq_s32_s16(c1.val[0]));
- const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
- vreinterpretq_s32_s16(c1.val[1]));
- const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
- vreinterpretq_s32_s16(c3.val[0]));
- const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
- vreinterpretq_s32_s16(c3.val[1]));
-
- // Swap 64 bit elements
- const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
- const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
- const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
- const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
-
- b[0] = e0.val[0];
- b[1] = e1.val[0];
- b[2] = e2.val[0];
- b[3] = e3.val[0];
- b[4] = e0.val[1];
- b[5] = e1.val[1];
- b[6] = e2.val[1];
- b[7] = e3.val[1];
-}
-
void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
int16x8_t temp0[32];
int16x8_t temp1[32];
@@ -1324,23 +43,27 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
int16x8_t temp5[32];
// Process in 8x32 columns.
- load(input, stride, temp0);
- dct_body_first_pass(temp0, temp1);
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp1);
- load(input + 8, stride, temp0);
- dct_body_first_pass(temp0, temp2);
+ load_cross(input + 8, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp2);
- load(input + 16, stride, temp0);
- dct_body_first_pass(temp0, temp3);
+ load_cross(input + 16, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp3);
- load(input + 24, stride, temp0);
- dct_body_first_pass(temp0, temp4);
+ load_cross(input + 24, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp4);
// Generate the top row by munging the first set of 8 from each one together.
- transpose_8x8(&temp1[0], &temp0[0]);
- transpose_8x8(&temp2[0], &temp0[8]);
- transpose_8x8(&temp3[0], &temp0[16]);
- transpose_8x8(&temp4[0], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[0], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[0], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[0], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[0], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@@ -1355,10 +78,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output, temp5);
// Second row of 8x32.
- transpose_8x8(&temp1[8], &temp0[0]);
- transpose_8x8(&temp2[8], &temp0[8]);
- transpose_8x8(&temp3[8], &temp0[16]);
- transpose_8x8(&temp4[8], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[8], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[8], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[8], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[8], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@@ -1373,10 +96,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output + 8 * 32, temp5);
// Third row of 8x32
- transpose_8x8(&temp1[16], &temp0[0]);
- transpose_8x8(&temp2[16], &temp0[8]);
- transpose_8x8(&temp3[16], &temp0[16]);
- transpose_8x8(&temp4[16], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[16], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[16], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[16], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[16], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@@ -1391,10 +114,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output + 16 * 32, temp5);
// Final row of 8x32.
- transpose_8x8(&temp1[24], &temp0[0]);
- transpose_8x8(&temp2[24], &temp0[8]);
- transpose_8x8(&temp3[24], &temp0[16]);
- transpose_8x8(&temp4[24], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[24], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[24], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[24], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[24], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@@ -1419,23 +142,27 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
int16x8_t temp5[32];
// Process in 8x32 columns.
- load(input, stride, temp0);
- dct_body_first_pass(temp0, temp1);
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp1);
- load(input + 8, stride, temp0);
- dct_body_first_pass(temp0, temp2);
+ load_cross(input + 8, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp2);
- load(input + 16, stride, temp0);
- dct_body_first_pass(temp0, temp3);
+ load_cross(input + 16, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp3);
- load(input + 24, stride, temp0);
- dct_body_first_pass(temp0, temp4);
+ load_cross(input + 24, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp4);
// Generate the top row by munging the first set of 8 from each one together.
- transpose_8x8(&temp1[0], &temp0[0]);
- transpose_8x8(&temp2[0], &temp0[8]);
- transpose_8x8(&temp3[0], &temp0[16]);
- transpose_8x8(&temp4[0], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[0], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[0], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[0], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[0], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@@ -1450,10 +177,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
store(output, temp5);
// Second row of 8x32.
- transpose_8x8(&temp1[8], &temp0[0]);
- transpose_8x8(&temp2[8], &temp0[8]);
- transpose_8x8(&temp3[8], &temp0[16]);
- transpose_8x8(&temp4[8], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[8], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[8], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[8], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[8], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@@ -1468,10 +195,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
store(output + 8 * 32, temp5);
// Third row of 8x32
- transpose_8x8(&temp1[16], &temp0[0]);
- transpose_8x8(&temp2[16], &temp0[8]);
- transpose_8x8(&temp3[16], &temp0[16]);
- transpose_8x8(&temp4[16], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[16], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[16], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[16], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[16], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@@ -1486,10 +213,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
store(output + 16 * 32, temp5);
// Final row of 8x32.
- transpose_8x8(&temp1[24], &temp0[0]);
- transpose_8x8(&temp2[24], &temp0[8]);
- transpose_8x8(&temp3[24], &temp0[16]);
- transpose_8x8(&temp4[24], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[24], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[24], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[24], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[24], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@@ -1503,5 +230,190 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
&temp5[29], &temp5[30], &temp5[31]);
store(output + 24 * 32, temp5);
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[32];
+ int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+ right3[32], right4[32];
+ int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+ left8[32], right8[32];
+ int32x4_t temp1[32], temp2[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ highbd_dct8x32_body_first_pass(left1, right1);
+ highbd_partial_sub_round_shift(left1, right1);
+
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ highbd_dct8x32_body_first_pass(left2, right2);
+ highbd_partial_sub_round_shift(left2, right2);
+
+ load_cross(input + 16, stride, temp0);
+ highbd_scale_input(temp0, left3, right3);
+ highbd_dct8x32_body_first_pass(left3, right3);
+ highbd_partial_sub_round_shift(left3, right3);
+
+ load_cross(input + 24, stride, temp0);
+ highbd_scale_input(temp0, left4, right4);
+ highbd_dct8x32_body_first_pass(left4, right4);
+ highbd_partial_sub_round_shift(left4, right4);
+
+ // Generate the top row by munging the first set of 8 from each one together.
+ transpose_s32_8x8_2(left1, right1, temp1, temp2);
+ transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left5, right5);
+ highbd_dct8x32_body_second_pass(left5, right5);
+ highbd_partial_add_round_shift(left5, right5);
+
+ // Second row of 8x32.
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left6, right6);
+ highbd_dct8x32_body_second_pass(left6, right6);
+ highbd_partial_add_round_shift(left6, right6);
+
+ // Third row of 8x32
+ transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left7, right7);
+ highbd_dct8x32_body_second_pass(left7, right7);
+ highbd_partial_add_round_shift(left7, right7);
+
+ // Final row of 8x32.
+ transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left8, right8);
+ highbd_dct8x32_body_second_pass(left8, right8);
+ highbd_partial_add_round_shift(left8, right8);
+
+ // Final transpose
+ transpose_s32_8x8_2(left5, right5, left1, right1);
+ transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+ transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+ transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+ transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+ transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+ transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+ transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+ transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+ transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+ transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+ transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+ transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+ transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+ transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+ store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+ right4);
+}
+
+void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[32];
+ int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+ right3[32], right4[32];
+ int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+ left8[32], right8[32];
+ int32x4_t temp1[32], temp2[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ highbd_dct8x32_body_first_pass(left1, right1);
+ highbd_partial_sub_round_shift(left1, right1);
+
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ highbd_dct8x32_body_first_pass(left2, right2);
+ highbd_partial_sub_round_shift(left2, right2);
+
+ load_cross(input + 16, stride, temp0);
+ highbd_scale_input(temp0, left3, right3);
+ highbd_dct8x32_body_first_pass(left3, right3);
+ highbd_partial_sub_round_shift(left3, right3);
+
+ load_cross(input + 24, stride, temp0);
+ highbd_scale_input(temp0, left4, right4);
+ highbd_dct8x32_body_first_pass(left4, right4);
+ highbd_partial_sub_round_shift(left4, right4);
+
+ // Generate the top row by munging the first set of 8 from each one together.
+ transpose_s32_8x8_2(left1, right1, temp1, temp2);
+ transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left5, right5);
+ highbd_dct8x32_body_second_pass_rd(left5, right5);
+
+ // Second row of 8x32.
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left6, right6);
+ highbd_dct8x32_body_second_pass_rd(left6, right6);
+
+ // Third row of 8x32
+ transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left7, right7);
+ highbd_dct8x32_body_second_pass_rd(left7, right7);
+
+ // Final row of 8x32.
+ transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left8, right8);
+ highbd_dct8x32_body_second_pass_rd(left8, right8);
+
+ // Final transpose
+ transpose_s32_8x8_2(left5, right5, left1, right1);
+ transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+ transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+ transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+ transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+ transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+ transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+ transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+ transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+ transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+ transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+ transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+ transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+ transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+ transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+ store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+ right4);
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
// __GNUC__ == 4 && __GNUC_MINOR__ <= 9
diff --git a/vpx_dsp/arm/fdct32x32_neon.h b/vpx_dsp/arm/fdct32x32_neon.h
new file mode 100644
index 000000000..3b9e64c6d
--- /dev/null
+++ b/vpx_dsp/arm/fdct32x32_neon.h
@@ -0,0 +1,2919 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+
+// Load & cross the first 8 and last 8, then the middle
+static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) {
+ b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+ b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+ b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+ b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+ b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+ b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+ b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+ b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+
+ b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+ b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+ b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+ b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+ b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+ b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+ b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+ b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+
+ b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+ b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+ b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+ b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+ b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+ b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+ b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+ b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+
+ b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+ b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+ b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+ b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+ b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+ b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+ b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+ b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+}
+
+#define STORE_S16(src, index, dest) \
+ do { \
+ store_s16q_to_tran_low(dest, src[index]); \
+ dest += 8; \
+ } while (0)
+
+// Store 32 16x8 values, assuming stride == 32.
+// Slight twist: store horizontally in blocks of 8.
+static INLINE void store(tran_low_t *a, const int16x8_t *b) {
+ STORE_S16(b, 0, a);
+ STORE_S16(b, 8, a);
+ STORE_S16(b, 16, a);
+ STORE_S16(b, 24, a);
+ STORE_S16(b, 1, a);
+ STORE_S16(b, 9, a);
+ STORE_S16(b, 17, a);
+ STORE_S16(b, 25, a);
+ STORE_S16(b, 2, a);
+ STORE_S16(b, 10, a);
+ STORE_S16(b, 18, a);
+ STORE_S16(b, 26, a);
+ STORE_S16(b, 3, a);
+ STORE_S16(b, 11, a);
+ STORE_S16(b, 19, a);
+ STORE_S16(b, 27, a);
+ STORE_S16(b, 4, a);
+ STORE_S16(b, 12, a);
+ STORE_S16(b, 20, a);
+ STORE_S16(b, 28, a);
+ STORE_S16(b, 5, a);
+ STORE_S16(b, 13, a);
+ STORE_S16(b, 21, a);
+ STORE_S16(b, 29, a);
+ STORE_S16(b, 6, a);
+ STORE_S16(b, 14, a);
+ STORE_S16(b, 22, a);
+ STORE_S16(b, 30, a);
+ STORE_S16(b, 7, a);
+ STORE_S16(b, 15, a);
+ STORE_S16(b, 23, a);
+ STORE_S16(b, 31, a);
+}
+
+#undef STORE_S16
+
+static INLINE void scale_input(const int16x8_t *in /*32*/,
+ int16x8_t *out /*32*/) {
+ out[0] = vshlq_n_s16(in[0], 2);
+ out[1] = vshlq_n_s16(in[1], 2);
+ out[2] = vshlq_n_s16(in[2], 2);
+ out[3] = vshlq_n_s16(in[3], 2);
+ out[4] = vshlq_n_s16(in[4], 2);
+ out[5] = vshlq_n_s16(in[5], 2);
+ out[6] = vshlq_n_s16(in[6], 2);
+ out[7] = vshlq_n_s16(in[7], 2);
+
+ out[8] = vshlq_n_s16(in[8], 2);
+ out[9] = vshlq_n_s16(in[9], 2);
+ out[10] = vshlq_n_s16(in[10], 2);
+ out[11] = vshlq_n_s16(in[11], 2);
+ out[12] = vshlq_n_s16(in[12], 2);
+ out[13] = vshlq_n_s16(in[13], 2);
+ out[14] = vshlq_n_s16(in[14], 2);
+ out[15] = vshlq_n_s16(in[15], 2);
+
+ out[16] = vshlq_n_s16(in[16], 2);
+ out[17] = vshlq_n_s16(in[17], 2);
+ out[18] = vshlq_n_s16(in[18], 2);
+ out[19] = vshlq_n_s16(in[19], 2);
+ out[20] = vshlq_n_s16(in[20], 2);
+ out[21] = vshlq_n_s16(in[21], 2);
+ out[22] = vshlq_n_s16(in[22], 2);
+ out[23] = vshlq_n_s16(in[23], 2);
+
+ out[24] = vshlq_n_s16(in[24], 2);
+ out[25] = vshlq_n_s16(in[25], 2);
+ out[26] = vshlq_n_s16(in[26], 2);
+ out[27] = vshlq_n_s16(in[27], 2);
+ out[28] = vshlq_n_s16(in[28], 2);
+ out[29] = vshlq_n_s16(in[29], 2);
+ out[30] = vshlq_n_s16(in[30], 2);
+ out[31] = vshlq_n_s16(in[31], 2);
+}
+
+static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // Mini cross. X the first 16 values and the middle 8 of the second half.
+ a[0] = vaddq_s16(in[0], in[15]);
+ a[1] = vaddq_s16(in[1], in[14]);
+ a[2] = vaddq_s16(in[2], in[13]);
+ a[3] = vaddq_s16(in[3], in[12]);
+ a[4] = vaddq_s16(in[4], in[11]);
+ a[5] = vaddq_s16(in[5], in[10]);
+ a[6] = vaddq_s16(in[6], in[9]);
+ a[7] = vaddq_s16(in[7], in[8]);
+
+ a[8] = vsubq_s16(in[7], in[8]);
+ a[9] = vsubq_s16(in[6], in[9]);
+ a[10] = vsubq_s16(in[5], in[10]);
+ a[11] = vsubq_s16(in[4], in[11]);
+ a[12] = vsubq_s16(in[3], in[12]);
+ a[13] = vsubq_s16(in[2], in[13]);
+ a[14] = vsubq_s16(in[1], in[14]);
+ a[15] = vsubq_s16(in[0], in[15]);
+
+ a[16] = in[16];
+ a[17] = in[17];
+ a[18] = in[18];
+ a[19] = in[19];
+
+ butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27],
+ &a[20]);
+ butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26],
+ &a[21]);
+ butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25],
+ &a[22]);
+ butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24],
+ &a[23]);
+
+ a[28] = in[28];
+ a[29] = in[29];
+ a[30] = in[30];
+ a[31] = in[31];
+
+ // Stage 3.
+ b[0] = vaddq_s16(a[0], a[7]);
+ b[1] = vaddq_s16(a[1], a[6]);
+ b[2] = vaddq_s16(a[2], a[5]);
+ b[3] = vaddq_s16(a[3], a[4]);
+
+ b[4] = vsubq_s16(a[3], a[4]);
+ b[5] = vsubq_s16(a[2], a[5]);
+ b[6] = vsubq_s16(a[1], a[6]);
+ b[7] = vsubq_s16(a[0], a[7]);
+
+ b[8] = a[8];
+ b[9] = a[9];
+
+ butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]);
+ butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]);
+
+ b[14] = a[14];
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(in[16], a[23]);
+ b[17] = vaddq_s16(in[17], a[22]);
+ b[18] = vaddq_s16(in[18], a[21]);
+ b[19] = vaddq_s16(in[19], a[20]);
+
+ b[20] = vsubq_s16(in[19], a[20]);
+ b[21] = vsubq_s16(in[18], a[21]);
+ b[22] = vsubq_s16(in[17], a[22]);
+ b[23] = vsubq_s16(in[16], a[23]);
+
+ b[24] = vsubq_s16(in[31], a[24]);
+ b[25] = vsubq_s16(in[30], a[25]);
+ b[26] = vsubq_s16(in[29], a[26]);
+ b[27] = vsubq_s16(in[28], a[27]);
+
+ b[28] = vaddq_s16(in[28], a[27]);
+ b[29] = vaddq_s16(in[29], a[26]);
+ b[30] = vaddq_s16(in[30], a[25]);
+ b[31] = vaddq_s16(in[31], a[24]);
+
+ // Stage 4.
+ a[0] = vaddq_s16(b[0], b[3]);
+ a[1] = vaddq_s16(b[1], b[2]);
+ a[2] = vsubq_s16(b[1], b[2]);
+ a[3] = vsubq_s16(b[0], b[3]);
+
+ a[4] = b[4];
+
+ butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]);
+
+ a[7] = b[7];
+
+ a[8] = vaddq_s16(b[8], b[11]);
+ a[9] = vaddq_s16(b[9], b[10]);
+ a[10] = vsubq_s16(b[9], b[10]);
+ a[11] = vsubq_s16(b[8], b[11]);
+ a[12] = vsubq_s16(b[15], b[12]);
+ a[13] = vsubq_s16(b[14], b[13]);
+ a[14] = vaddq_s16(b[14], b[13]);
+ a[15] = vaddq_s16(b[15], b[12]);
+
+ a[16] = b[16];
+ a[17] = b[17];
+
+ butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]);
+ butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]);
+ butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]);
+ butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]);
+
+ a[22] = b[22];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[25] = b[25];
+
+ a[30] = b[30];
+ a[31] = b[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]);
+ butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]);
+
+ b[4] = vaddq_s16(a[4], a[5]);
+ b[5] = vsubq_s16(a[4], a[5]);
+ b[6] = vsubq_s16(a[7], a[6]);
+ b[7] = vaddq_s16(a[7], a[6]);
+
+ b[8] = a[8];
+
+ butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]);
+ butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]);
+
+ b[11] = a[11];
+ b[12] = a[12];
+
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(a[19], a[16]);
+ b[17] = vaddq_s16(a[18], a[17]);
+ b[18] = vsubq_s16(a[17], a[18]);
+ b[19] = vsubq_s16(a[16], a[19]);
+ b[20] = vsubq_s16(a[23], a[20]);
+ b[21] = vsubq_s16(a[22], a[21]);
+ b[22] = vaddq_s16(a[21], a[22]);
+ b[23] = vaddq_s16(a[20], a[23]);
+ b[24] = vaddq_s16(a[27], a[24]);
+ b[25] = vaddq_s16(a[26], a[25]);
+ b[26] = vsubq_s16(a[25], a[26]);
+ b[27] = vsubq_s16(a[24], a[27]);
+ b[28] = vsubq_s16(a[31], a[28]);
+ b[29] = vsubq_s16(a[30], a[29]);
+ b[30] = vaddq_s16(a[29], a[30]);
+ b[31] = vaddq_s16(a[28], a[31]);
+
+ // Stage 6.
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+
+ butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]);
+ butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]);
+
+ a[8] = vaddq_s16(b[8], b[9]);
+ a[9] = vsubq_s16(b[8], b[9]);
+ a[10] = vsubq_s16(b[11], b[10]);
+ a[11] = vaddq_s16(b[11], b[10]);
+ a[12] = vaddq_s16(b[12], b[13]);
+ a[13] = vsubq_s16(b[12], b[13]);
+ a[14] = vsubq_s16(b[15], b[14]);
+ a[15] = vaddq_s16(b[15], b[14]);
+
+ a[16] = b[16];
+ a[19] = b[19];
+ a[20] = b[20];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[27] = b[27];
+ a[28] = b[28];
+ a[31] = b[31];
+
+ butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]);
+ butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]);
+ butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]);
+
+ // Stage 7.
+ b[0] = a[0];
+ b[1] = a[1];
+ b[2] = a[2];
+ b[3] = a[3];
+ b[4] = a[4];
+ b[5] = a[5];
+ b[6] = a[6];
+ b[7] = a[7];
+
+ butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]);
+ butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]);
+ butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]);
+ butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]);
+
+ b[16] = vaddq_s16(a[16], a[17]);
+ b[17] = vsubq_s16(a[16], a[17]);
+ b[18] = vsubq_s16(a[19], a[18]);
+ b[19] = vaddq_s16(a[19], a[18]);
+ b[20] = vaddq_s16(a[20], a[21]);
+ b[21] = vsubq_s16(a[20], a[21]);
+ b[22] = vsubq_s16(a[23], a[22]);
+ b[23] = vaddq_s16(a[23], a[22]);
+ b[24] = vaddq_s16(a[24], a[25]);
+ b[25] = vsubq_s16(a[24], a[25]);
+ b[26] = vsubq_s16(a[27], a[26]);
+ b[27] = vaddq_s16(a[27], a[26]);
+ b[28] = vaddq_s16(a[28], a[29]);
+ b[29] = vsubq_s16(a[28], a[29]);
+ b[30] = vsubq_s16(a[31], a[30]);
+ b[31] = vaddq_s16(a[31], a[30]);
+
+ // Final stage.
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ out[0] = sub_round_shift_s16(b[0]);
+ out[16] = sub_round_shift_s16(b[1]);
+ out[8] = sub_round_shift_s16(b[2]);
+ out[24] = sub_round_shift_s16(b[3]);
+ out[4] = sub_round_shift_s16(b[4]);
+ out[20] = sub_round_shift_s16(b[5]);
+ out[12] = sub_round_shift_s16(b[6]);
+ out[28] = sub_round_shift_s16(b[7]);
+ out[2] = sub_round_shift_s16(b[8]);
+ out[18] = sub_round_shift_s16(b[9]);
+ out[10] = sub_round_shift_s16(b[10]);
+ out[26] = sub_round_shift_s16(b[11]);
+ out[6] = sub_round_shift_s16(b[12]);
+ out[22] = sub_round_shift_s16(b[13]);
+ out[14] = sub_round_shift_s16(b[14]);
+ out[30] = sub_round_shift_s16(b[15]);
+
+ butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]);
+ out[1] = sub_round_shift_s16(a[1]);
+ out[31] = sub_round_shift_s16(a[31]);
+
+ butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]);
+ out[17] = sub_round_shift_s16(a[17]);
+ out[15] = sub_round_shift_s16(a[15]);
+
+ butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]);
+ out[9] = sub_round_shift_s16(a[9]);
+ out[23] = sub_round_shift_s16(a[23]);
+
+ butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]);
+ out[25] = sub_round_shift_s16(a[25]);
+ out[7] = sub_round_shift_s16(a[7]);
+
+ butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]);
+ out[5] = sub_round_shift_s16(a[5]);
+ out[27] = sub_round_shift_s16(a[27]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]);
+ out[21] = sub_round_shift_s16(a[21]);
+ out[11] = sub_round_shift_s16(a[11]);
+
+ butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]);
+ out[13] = sub_round_shift_s16(a[13]);
+ out[19] = sub_round_shift_s16(a[19]);
+
+ butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]);
+ out[29] = sub_round_shift_s16(a[29]);
+ out[3] = sub_round_shift_s16(a[3]);
+}
+
+#define PASS_THROUGH(src, dst, element) \
+ do { \
+ dst##_lo[element] = src##_lo[element]; \
+ dst##_hi[element] = src##_hi[element]; \
+ } while (0)
+
+#define ADD_S16_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = \
+ vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+ b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \
+ vget_high_s16(a[right_index])); \
+ } while (0)
+
+#define SUB_S16_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = \
+ vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+ b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \
+ vget_high_s16(a[right_index])); \
+ } while (0)
+
+#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \
+ do { \
+ c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \
+ c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
+ } while (0)
+
+#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
+ do { \
+ temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \
+ temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \
+ c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \
+ c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \
+ } while (0)
+
+#define ADD_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
+ b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
+ } while (0)
+
+#define SUB_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
+ b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
+ } while (0)
+
+#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \
+ add_index, sub_index) \
+ do { \
+ butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
+ &b##_lo[add_index], &b##_hi[add_index], \
+ &b##_lo[sub_index], &b##_hi[sub_index]); \
+ } while (0)
+
+#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
+ sub_index) \
+ do { \
+ butterfly_one_coeff_s32_fast( \
+ a##_lo[left_index], a##_hi[left_index], a##_lo[right_index], \
+ a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \
+ &b##_lo[sub_index], &b##_hi[sub_index]); \
+ } while (0)
+
+#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \
+ right_constant, b, add_index, sub_index) \
+ do { \
+ butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
+ a##_lo[right_index], a##_hi[right_index], \
+ left_constant, right_constant, &b##_lo[add_index], \
+ &b##_hi[add_index], &b##_lo[sub_index], \
+ &b##_hi[sub_index]); \
+ } while (0)
+
+static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+ int32x4_t c_lo[32];
+ int32x4_t c_hi[32];
+ int32x4_t d_lo[32];
+ int32x4_t d_hi[32];
+
+ // Stage 1. Done as part of the load for the first pass.
+ a[0] = vaddq_s16(in[0], in[31]);
+ a[1] = vaddq_s16(in[1], in[30]);
+ a[2] = vaddq_s16(in[2], in[29]);
+ a[3] = vaddq_s16(in[3], in[28]);
+ a[4] = vaddq_s16(in[4], in[27]);
+ a[5] = vaddq_s16(in[5], in[26]);
+ a[6] = vaddq_s16(in[6], in[25]);
+ a[7] = vaddq_s16(in[7], in[24]);
+ a[8] = vaddq_s16(in[8], in[23]);
+ a[9] = vaddq_s16(in[9], in[22]);
+ a[10] = vaddq_s16(in[10], in[21]);
+ a[11] = vaddq_s16(in[11], in[20]);
+ a[12] = vaddq_s16(in[12], in[19]);
+ a[13] = vaddq_s16(in[13], in[18]);
+ a[14] = vaddq_s16(in[14], in[17]);
+ a[15] = vaddq_s16(in[15], in[16]);
+ a[16] = vsubq_s16(in[15], in[16]);
+ a[17] = vsubq_s16(in[14], in[17]);
+ a[18] = vsubq_s16(in[13], in[18]);
+ a[19] = vsubq_s16(in[12], in[19]);
+ a[20] = vsubq_s16(in[11], in[20]);
+ a[21] = vsubq_s16(in[10], in[21]);
+ a[22] = vsubq_s16(in[9], in[22]);
+ a[23] = vsubq_s16(in[8], in[23]);
+ a[24] = vsubq_s16(in[7], in[24]);
+ a[25] = vsubq_s16(in[6], in[25]);
+ a[26] = vsubq_s16(in[5], in[26]);
+ a[27] = vsubq_s16(in[4], in[27]);
+ a[28] = vsubq_s16(in[3], in[28]);
+ a[29] = vsubq_s16(in[2], in[29]);
+ a[30] = vsubq_s16(in[1], in[30]);
+ a[31] = vsubq_s16(in[0], in[31]);
+
+ // Stage 2.
+ b[0] = vaddq_s16(a[0], a[15]);
+ b[1] = vaddq_s16(a[1], a[14]);
+ b[2] = vaddq_s16(a[2], a[13]);
+ b[3] = vaddq_s16(a[3], a[12]);
+ b[4] = vaddq_s16(a[4], a[11]);
+ b[5] = vaddq_s16(a[5], a[10]);
+ b[6] = vaddq_s16(a[6], a[9]);
+ b[7] = vaddq_s16(a[7], a[8]);
+
+ b[8] = vsubq_s16(a[7], a[8]);
+ b[9] = vsubq_s16(a[6], a[9]);
+ b[10] = vsubq_s16(a[5], a[10]);
+ b[11] = vsubq_s16(a[4], a[11]);
+ b[12] = vsubq_s16(a[3], a[12]);
+ b[13] = vsubq_s16(a[2], a[13]);
+ b[14] = vsubq_s16(a[1], a[14]);
+ b[15] = vsubq_s16(a[0], a[15]);
+
+ b[16] = a[16];
+ b[17] = a[17];
+ b[18] = a[18];
+ b[19] = a[19];
+
+ butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+ butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+ butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+ butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+
+ b[28] = a[28];
+ b[29] = a[29];
+ b[30] = a[30];
+ b[31] = a[31];
+
+ // Stage 3. With extreme values for input this calculation rolls over int16_t.
+ // The sources for b[0] get added multiple times and, through testing, have
+ // been shown to overflow starting here.
+ ADD_S16_S32(b, 0, 7, c, 0);
+ ADD_S16_S32(b, 1, 6, c, 1);
+ ADD_S16_S32(b, 2, 5, c, 2);
+ ADD_S16_S32(b, 3, 4, c, 3);
+ SUB_S16_S32(b, 3, 4, c, 4);
+ SUB_S16_S32(b, 2, 5, c, 5);
+ SUB_S16_S32(b, 1, 6, c, 6);
+ SUB_S16_S32(b, 0, 7, c, 7);
+
+ a[8] = b[8];
+ a[9] = b[9];
+
+ BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
+ BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
+
+ a[14] = b[14];
+ a[15] = b[15];
+
+ ADD_S16_S32(b, 16, 23, c, 16);
+ ADD_S16_S32(b, 17, 22, c, 17);
+ ADD_S16_S32(b, 18, 21, c, 18);
+ ADD_S16_S32(b, 19, 20, c, 19);
+ SUB_S16_S32(b, 19, 20, c, 20);
+ SUB_S16_S32(b, 18, 21, c, 21);
+ SUB_S16_S32(b, 17, 22, c, 22);
+ SUB_S16_S32(b, 16, 23, c, 23);
+ SUB_S16_S32(b, 31, 24, c, 24);
+ SUB_S16_S32(b, 30, 25, c, 25);
+ SUB_S16_S32(b, 29, 26, c, 26);
+ SUB_S16_S32(b, 28, 27, c, 27);
+ ADD_S16_S32(b, 28, 27, c, 28);
+ ADD_S16_S32(b, 29, 26, c, 29);
+ ADD_S16_S32(b, 30, 25, c, 30);
+ ADD_S16_S32(b, 31, 24, c, 31);
+
+ // Stage 4.
+ ADD_S32(c, 0, 3, d, 0);
+ ADD_S32(c, 1, 2, d, 1);
+ SUB_S32(c, 1, 2, d, 2);
+ SUB_S32(c, 0, 3, d, 3);
+
+ PASS_THROUGH(c, d, 4);
+
+ BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
+
+ PASS_THROUGH(c, d, 7);
+
+ ADDW_S16_S32(c, 11, a, 8, d, 8);
+ ADDW_S16_S32(c, 10, a, 9, d, 9);
+ SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
+ SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
+ SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
+ SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
+ ADDW_S16_S32(c, 13, b, 14, d, 14);
+ ADDW_S16_S32(c, 12, b, 15, d, 15);
+
+ PASS_THROUGH(c, d, 16);
+ PASS_THROUGH(c, d, 17);
+
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18);
+ BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19);
+ BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20);
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21);
+
+ PASS_THROUGH(c, d, 22);
+ PASS_THROUGH(c, d, 23);
+ PASS_THROUGH(c, d, 24);
+ PASS_THROUGH(c, d, 25);
+
+ PASS_THROUGH(c, d, 30);
+ PASS_THROUGH(c, d, 31);
+
+ // Stage 5.
+ BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
+ BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3);
+
+ ADD_S32(d, 4, 5, c, 4);
+ SUB_S32(d, 4, 5, c, 5);
+ SUB_S32(d, 7, 6, c, 6);
+ ADD_S32(d, 7, 6, c, 7);
+
+ PASS_THROUGH(d, c, 8);
+
+ BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9);
+ BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10);
+
+ PASS_THROUGH(d, c, 11);
+ PASS_THROUGH(d, c, 12);
+ PASS_THROUGH(d, c, 15);
+
+ ADD_S32(d, 16, 19, c, 16);
+ ADD_S32(d, 17, 18, c, 17);
+ SUB_S32(d, 17, 18, c, 18);
+ SUB_S32(d, 16, 19, c, 19);
+ SUB_S32(d, 23, 20, c, 20);
+ SUB_S32(d, 22, 21, c, 21);
+ ADD_S32(d, 22, 21, c, 22);
+ ADD_S32(d, 23, 20, c, 23);
+ ADD_S32(d, 24, 27, c, 24);
+ ADD_S32(d, 25, 26, c, 25);
+ SUB_S32(d, 25, 26, c, 26);
+ SUB_S32(d, 24, 27, c, 27);
+ SUB_S32(d, 31, 28, c, 28);
+ SUB_S32(d, 30, 29, c, 29);
+ ADD_S32(d, 30, 29, c, 30);
+ ADD_S32(d, 31, 28, c, 31);
+
+ // Stage 6.
+ PASS_THROUGH(c, d, 0);
+ PASS_THROUGH(c, d, 1);
+ PASS_THROUGH(c, d, 2);
+ PASS_THROUGH(c, d, 3);
+
+ BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7);
+ BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6);
+
+ ADD_S32(c, 8, 9, d, 8);
+ SUB_S32(c, 8, 9, d, 9);
+ SUB_S32(c, 11, 10, d, 10);
+ ADD_S32(c, 11, 10, d, 11);
+ ADD_S32(c, 12, 13, d, 12);
+ SUB_S32(c, 12, 13, d, 13);
+ SUB_S32(c, 15, 14, d, 14);
+ ADD_S32(c, 15, 14, d, 15);
+
+ PASS_THROUGH(c, d, 16);
+ PASS_THROUGH(c, d, 19);
+ PASS_THROUGH(c, d, 20);
+ PASS_THROUGH(c, d, 23);
+ PASS_THROUGH(c, d, 24);
+ PASS_THROUGH(c, d, 27);
+ PASS_THROUGH(c, d, 28);
+ PASS_THROUGH(c, d, 31);
+
+ BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17);
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18);
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21);
+ BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22);
+
+ // Stage 7.
+ PASS_THROUGH(d, c, 0);
+ PASS_THROUGH(d, c, 1);
+ PASS_THROUGH(d, c, 2);
+ PASS_THROUGH(d, c, 3);
+ PASS_THROUGH(d, c, 4);
+ PASS_THROUGH(d, c, 5);
+ PASS_THROUGH(d, c, 6);
+ PASS_THROUGH(d, c, 7);
+
+ BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15);
+ BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14);
+ BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13);
+ BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12);
+
+ ADD_S32(d, 16, 17, c, 16);
+ SUB_S32(d, 16, 17, c, 17);
+ SUB_S32(d, 19, 18, c, 18);
+ ADD_S32(d, 19, 18, c, 19);
+ ADD_S32(d, 20, 21, c, 20);
+ SUB_S32(d, 20, 21, c, 21);
+ SUB_S32(d, 23, 22, c, 22);
+ ADD_S32(d, 23, 22, c, 23);
+ ADD_S32(d, 24, 25, c, 24);
+ SUB_S32(d, 24, 25, c, 25);
+ SUB_S32(d, 27, 26, c, 26);
+ ADD_S32(d, 27, 26, c, 27);
+ ADD_S32(d, 28, 29, c, 28);
+ SUB_S32(d, 28, 29, c, 29);
+ SUB_S32(d, 31, 30, c, 30);
+ ADD_S32(d, 31, 30, c, 31);
+
+ // Final stage.
+ // Roll rounding into this function so we can pass back int16x8.
+
+ out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]);
+ out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]);
+
+ out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]);
+ out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]);
+ out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]);
+ out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]);
+ out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]);
+
+ out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]);
+ out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]);
+ out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]);
+ out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]);
+
+ out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]);
+ out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]);
+ out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]);
+ out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]);
+ out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]);
+
+ BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31);
+ out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]);
+ out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]);
+
+ BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15);
+ out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]);
+ out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]);
+
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23);
+ out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]);
+ out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]);
+
+ BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7);
+ out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]);
+ out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]);
+
+ BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27);
+ out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]);
+ out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]);
+
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11);
+ out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]);
+ out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]);
+
+ BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19);
+ out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]);
+ out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]);
+
+ BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3);
+ out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]);
+ out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]);
+}
+
+static INLINE void dct_body_second_pass_rd(const int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+
+ // Stage 1. Done as part of the load for the first pass.
+ a[0] = vaddq_s16(in[0], in[31]);
+ a[1] = vaddq_s16(in[1], in[30]);
+ a[2] = vaddq_s16(in[2], in[29]);
+ a[3] = vaddq_s16(in[3], in[28]);
+ a[4] = vaddq_s16(in[4], in[27]);
+ a[5] = vaddq_s16(in[5], in[26]);
+ a[6] = vaddq_s16(in[6], in[25]);
+ a[7] = vaddq_s16(in[7], in[24]);
+ a[8] = vaddq_s16(in[8], in[23]);
+ a[9] = vaddq_s16(in[9], in[22]);
+ a[10] = vaddq_s16(in[10], in[21]);
+ a[11] = vaddq_s16(in[11], in[20]);
+ a[12] = vaddq_s16(in[12], in[19]);
+ a[13] = vaddq_s16(in[13], in[18]);
+ a[14] = vaddq_s16(in[14], in[17]);
+ a[15] = vaddq_s16(in[15], in[16]);
+ a[16] = vsubq_s16(in[15], in[16]);
+ a[17] = vsubq_s16(in[14], in[17]);
+ a[18] = vsubq_s16(in[13], in[18]);
+ a[19] = vsubq_s16(in[12], in[19]);
+ a[20] = vsubq_s16(in[11], in[20]);
+ a[21] = vsubq_s16(in[10], in[21]);
+ a[22] = vsubq_s16(in[9], in[22]);
+ a[23] = vsubq_s16(in[8], in[23]);
+ a[24] = vsubq_s16(in[7], in[24]);
+ a[25] = vsubq_s16(in[6], in[25]);
+ a[26] = vsubq_s16(in[5], in[26]);
+ a[27] = vsubq_s16(in[4], in[27]);
+ a[28] = vsubq_s16(in[3], in[28]);
+ a[29] = vsubq_s16(in[2], in[29]);
+ a[30] = vsubq_s16(in[1], in[30]);
+ a[31] = vsubq_s16(in[0], in[31]);
+
+ // Stage 2.
+ // For the "rd" version, all the values are rounded down after stage 2 to keep
+ // the values in 16 bits.
+ b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
+ b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
+ b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
+ b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
+ b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
+ b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
+ b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
+ b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
+
+ b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
+ b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
+ b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
+ b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
+ b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
+ b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
+ b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
+ b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
+
+ b[16] = add_round_shift_s16(a[16]);
+ b[17] = add_round_shift_s16(a[17]);
+ b[18] = add_round_shift_s16(a[18]);
+ b[19] = add_round_shift_s16(a[19]);
+
+ butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+ butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+ butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+ butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+ b[20] = add_round_shift_s16(b[20]);
+ b[21] = add_round_shift_s16(b[21]);
+ b[22] = add_round_shift_s16(b[22]);
+ b[23] = add_round_shift_s16(b[23]);
+ b[24] = add_round_shift_s16(b[24]);
+ b[25] = add_round_shift_s16(b[25]);
+ b[26] = add_round_shift_s16(b[26]);
+ b[27] = add_round_shift_s16(b[27]);
+
+ b[28] = add_round_shift_s16(a[28]);
+ b[29] = add_round_shift_s16(a[29]);
+ b[30] = add_round_shift_s16(a[30]);
+ b[31] = add_round_shift_s16(a[31]);
+
+ // Stage 3.
+ a[0] = vaddq_s16(b[0], b[7]);
+ a[1] = vaddq_s16(b[1], b[6]);
+ a[2] = vaddq_s16(b[2], b[5]);
+ a[3] = vaddq_s16(b[3], b[4]);
+
+ a[4] = vsubq_s16(b[3], b[4]);
+ a[5] = vsubq_s16(b[2], b[5]);
+ a[6] = vsubq_s16(b[1], b[6]);
+ a[7] = vsubq_s16(b[0], b[7]);
+
+ a[8] = b[8];
+ a[9] = b[9];
+
+ butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]);
+ butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]);
+
+ a[14] = b[14];
+ a[15] = b[15];
+
+ a[16] = vaddq_s16(b[16], b[23]);
+ a[17] = vaddq_s16(b[17], b[22]);
+ a[18] = vaddq_s16(b[18], b[21]);
+ a[19] = vaddq_s16(b[19], b[20]);
+
+ a[20] = vsubq_s16(b[19], b[20]);
+ a[21] = vsubq_s16(b[18], b[21]);
+ a[22] = vsubq_s16(b[17], b[22]);
+ a[23] = vsubq_s16(b[16], b[23]);
+
+ a[24] = vsubq_s16(b[31], b[24]);
+ a[25] = vsubq_s16(b[30], b[25]);
+ a[26] = vsubq_s16(b[29], b[26]);
+ a[27] = vsubq_s16(b[28], b[27]);
+
+ a[28] = vaddq_s16(b[28], b[27]);
+ a[29] = vaddq_s16(b[29], b[26]);
+ a[30] = vaddq_s16(b[30], b[25]);
+ a[31] = vaddq_s16(b[31], b[24]);
+
+ // Stage 4.
+ b[0] = vaddq_s16(a[0], a[3]);
+ b[1] = vaddq_s16(a[1], a[2]);
+ b[2] = vsubq_s16(a[1], a[2]);
+ b[3] = vsubq_s16(a[0], a[3]);
+
+ b[4] = a[4];
+
+ butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]);
+
+ b[7] = a[7];
+
+ b[8] = vaddq_s16(a[8], a[11]);
+ b[9] = vaddq_s16(a[9], a[10]);
+ b[10] = vsubq_s16(a[9], a[10]);
+ b[11] = vsubq_s16(a[8], a[11]);
+ b[12] = vsubq_s16(a[15], a[12]);
+ b[13] = vsubq_s16(a[14], a[13]);
+ b[14] = vaddq_s16(a[14], a[13]);
+ b[15] = vaddq_s16(a[15], a[12]);
+
+ b[16] = a[16];
+ b[17] = a[17];
+
+ butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]);
+ butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]);
+ butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]);
+ butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]);
+
+ b[22] = a[22];
+ b[23] = a[23];
+ b[24] = a[24];
+ b[25] = a[25];
+
+ b[30] = a[30];
+ b[31] = a[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]);
+ butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]);
+
+ a[4] = vaddq_s16(b[4], b[5]);
+ a[5] = vsubq_s16(b[4], b[5]);
+ a[6] = vsubq_s16(b[7], b[6]);
+ a[7] = vaddq_s16(b[7], b[6]);
+
+ a[8] = b[8];
+
+ butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]);
+ butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]);
+
+ a[11] = b[11];
+ a[12] = b[12];
+
+ a[15] = b[15];
+
+ a[16] = vaddq_s16(b[19], b[16]);
+ a[17] = vaddq_s16(b[18], b[17]);
+ a[18] = vsubq_s16(b[17], b[18]);
+ a[19] = vsubq_s16(b[16], b[19]);
+ a[20] = vsubq_s16(b[23], b[20]);
+ a[21] = vsubq_s16(b[22], b[21]);
+ a[22] = vaddq_s16(b[21], b[22]);
+ a[23] = vaddq_s16(b[20], b[23]);
+ a[24] = vaddq_s16(b[27], b[24]);
+ a[25] = vaddq_s16(b[26], b[25]);
+ a[26] = vsubq_s16(b[25], b[26]);
+ a[27] = vsubq_s16(b[24], b[27]);
+ a[28] = vsubq_s16(b[31], b[28]);
+ a[29] = vsubq_s16(b[30], b[29]);
+ a[30] = vaddq_s16(b[29], b[30]);
+ a[31] = vaddq_s16(b[28], b[31]);
+
+ // Stage 6.
+ b[0] = a[0];
+ b[1] = a[1];
+ b[2] = a[2];
+ b[3] = a[3];
+
+ butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]);
+ butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]);
+
+ b[8] = vaddq_s16(a[8], a[9]);
+ b[9] = vsubq_s16(a[8], a[9]);
+ b[10] = vsubq_s16(a[11], a[10]);
+ b[11] = vaddq_s16(a[11], a[10]);
+ b[12] = vaddq_s16(a[12], a[13]);
+ b[13] = vsubq_s16(a[12], a[13]);
+ b[14] = vsubq_s16(a[15], a[14]);
+ b[15] = vaddq_s16(a[15], a[14]);
+
+ b[16] = a[16];
+ b[19] = a[19];
+ b[20] = a[20];
+ b[23] = a[23];
+ b[24] = a[24];
+ b[27] = a[27];
+ b[28] = a[28];
+ b[31] = a[31];
+
+ butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]);
+ butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]);
+
+ butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]);
+ butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]);
+
+ // Stage 7.
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+ a[4] = b[4];
+ a[5] = b[5];
+ a[6] = b[6];
+ a[7] = b[7];
+
+ butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]);
+ butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]);
+ butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]);
+ butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]);
+
+ a[16] = vaddq_s16(b[16], b[17]);
+ a[17] = vsubq_s16(b[16], b[17]);
+ a[18] = vsubq_s16(b[19], b[18]);
+ a[19] = vaddq_s16(b[19], b[18]);
+ a[20] = vaddq_s16(b[20], b[21]);
+ a[21] = vsubq_s16(b[20], b[21]);
+ a[22] = vsubq_s16(b[23], b[22]);
+ a[23] = vaddq_s16(b[23], b[22]);
+ a[24] = vaddq_s16(b[24], b[25]);
+ a[25] = vsubq_s16(b[24], b[25]);
+ a[26] = vsubq_s16(b[27], b[26]);
+ a[27] = vaddq_s16(b[27], b[26]);
+ a[28] = vaddq_s16(b[28], b[29]);
+ a[29] = vsubq_s16(b[28], b[29]);
+ a[30] = vsubq_s16(b[31], b[30]);
+ a[31] = vaddq_s16(b[31], b[30]);
+
+ // Final stage.
+ out[0] = a[0];
+ out[16] = a[1];
+ out[8] = a[2];
+ out[24] = a[3];
+ out[4] = a[4];
+ out[20] = a[5];
+ out[12] = a[6];
+ out[28] = a[7];
+ out[2] = a[8];
+ out[18] = a[9];
+ out[10] = a[10];
+ out[26] = a[11];
+ out[6] = a[12];
+ out[22] = a[13];
+ out[14] = a[14];
+ out[30] = a[15];
+
+ butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]);
+ butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17],
+ &out[15]);
+ butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]);
+ butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]);
+ butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]);
+ butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21],
+ &out[11]);
+ butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13],
+ &out[19]);
+ butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]);
+}
+
+#undef PASS_THROUGH
+#undef ADD_S16_S32
+#undef SUB_S16_S32
+#undef ADDW_S16_S32
+#undef SUBW_S16_S32
+#undef ADD_S32
+#undef SUB_S32
+#undef BUTTERFLY_ONE_S16_S32
+#undef BUTTERFLY_ONE_S32
+#undef BUTTERFLY_TWO_S32
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// Store 32 32x4 vectors, assuming stride == 32.
+static INLINE void store32x32_s32(
+ tran_low_t *a, const int32x4_t *l1 /*[16]*/, const int32x4_t *r1 /*[16]*/,
+ const int32x4_t *l2 /*[16]*/, const int32x4_t *r2 /*[16]*/,
+ const int32x4_t *l3 /*[16]*/, const int32x4_t *r3 /*[16]*/,
+ const int32x4_t *l4 /*[16]*/, const int32x4_t *r4 /*[16]*/) {
+ int i;
+ for (i = 0; i < 32; i++) {
+ vst1q_s32(a, l1[i]);
+ vst1q_s32(a + 4, r1[i]);
+ vst1q_s32(a + 8, l2[i]);
+ vst1q_s32(a + 12, r2[i]);
+ vst1q_s32(a + 16, l3[i]);
+ vst1q_s32(a + 20, r3[i]);
+ vst1q_s32(a + 24, l4[i]);
+ vst1q_s32(a + 28, r4[i]);
+ a += 32;
+ }
+}
+
+static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/,
+ int32x4_t *left /*[32]*/,
+ int32x4_t *right /* [32] */) {
+ left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
+ left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
+ left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
+ left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
+ left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
+ left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
+ left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
+ left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
+ left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
+ left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
+ left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
+ left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
+ left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
+ left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
+ left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
+ left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+ left[16] = vshll_n_s16(vget_low_s16(a[16]), 2);
+ left[17] = vshll_n_s16(vget_low_s16(a[17]), 2);
+ left[18] = vshll_n_s16(vget_low_s16(a[18]), 2);
+ left[19] = vshll_n_s16(vget_low_s16(a[19]), 2);
+ left[20] = vshll_n_s16(vget_low_s16(a[20]), 2);
+ left[21] = vshll_n_s16(vget_low_s16(a[21]), 2);
+ left[22] = vshll_n_s16(vget_low_s16(a[22]), 2);
+ left[23] = vshll_n_s16(vget_low_s16(a[23]), 2);
+ left[24] = vshll_n_s16(vget_low_s16(a[24]), 2);
+ left[25] = vshll_n_s16(vget_low_s16(a[25]), 2);
+ left[26] = vshll_n_s16(vget_low_s16(a[26]), 2);
+ left[27] = vshll_n_s16(vget_low_s16(a[27]), 2);
+ left[28] = vshll_n_s16(vget_low_s16(a[28]), 2);
+ left[29] = vshll_n_s16(vget_low_s16(a[29]), 2);
+ left[30] = vshll_n_s16(vget_low_s16(a[30]), 2);
+ left[31] = vshll_n_s16(vget_low_s16(a[31]), 2);
+
+ right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+ right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+ right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+ right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+ right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+ right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+ right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+ right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+ right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+ right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+ right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+ right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+ right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+ right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+ right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
+ right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
+ right[16] = vshll_n_s16(vget_high_s16(a[16]), 2);
+ right[17] = vshll_n_s16(vget_high_s16(a[17]), 2);
+ right[18] = vshll_n_s16(vget_high_s16(a[18]), 2);
+ right[19] = vshll_n_s16(vget_high_s16(a[19]), 2);
+ right[20] = vshll_n_s16(vget_high_s16(a[20]), 2);
+ right[21] = vshll_n_s16(vget_high_s16(a[21]), 2);
+ right[22] = vshll_n_s16(vget_high_s16(a[22]), 2);
+ right[23] = vshll_n_s16(vget_high_s16(a[23]), 2);
+ right[24] = vshll_n_s16(vget_high_s16(a[24]), 2);
+ right[25] = vshll_n_s16(vget_high_s16(a[25]), 2);
+ right[26] = vshll_n_s16(vget_high_s16(a[26]), 2);
+ right[27] = vshll_n_s16(vget_high_s16(a[27]), 2);
+ right[28] = vshll_n_s16(vget_high_s16(a[28]), 2);
+ right[29] = vshll_n_s16(vget_high_s16(a[29]), 2);
+ right[30] = vshll_n_s16(vget_high_s16(a[30]), 2);
+ right[31] = vshll_n_s16(vget_high_s16(a[31]), 2);
+}
+
+static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/,
+ int32x4_t *a_right /*[32]*/,
+ int32x4_t *b_left /*[32]*/,
+ int32x4_t *b_right /*[32]*/) {
+ // Stage 1. Done as part of the load for the first pass.
+ b_left[0] = vaddq_s32(a_left[0], a_left[31]);
+ b_left[1] = vaddq_s32(a_left[1], a_left[30]);
+ b_left[2] = vaddq_s32(a_left[2], a_left[29]);
+ b_left[3] = vaddq_s32(a_left[3], a_left[28]);
+ b_left[4] = vaddq_s32(a_left[4], a_left[27]);
+ b_left[5] = vaddq_s32(a_left[5], a_left[26]);
+ b_left[6] = vaddq_s32(a_left[6], a_left[25]);
+ b_left[7] = vaddq_s32(a_left[7], a_left[24]);
+ b_left[8] = vaddq_s32(a_left[8], a_left[23]);
+ b_left[9] = vaddq_s32(a_left[9], a_left[22]);
+ b_left[10] = vaddq_s32(a_left[10], a_left[21]);
+ b_left[11] = vaddq_s32(a_left[11], a_left[20]);
+ b_left[12] = vaddq_s32(a_left[12], a_left[19]);
+ b_left[13] = vaddq_s32(a_left[13], a_left[18]);
+ b_left[14] = vaddq_s32(a_left[14], a_left[17]);
+ b_left[15] = vaddq_s32(a_left[15], a_left[16]);
+
+ b_right[0] = vaddq_s32(a_right[0], a_right[31]);
+ b_right[1] = vaddq_s32(a_right[1], a_right[30]);
+ b_right[2] = vaddq_s32(a_right[2], a_right[29]);
+ b_right[3] = vaddq_s32(a_right[3], a_right[28]);
+ b_right[4] = vaddq_s32(a_right[4], a_right[27]);
+ b_right[5] = vaddq_s32(a_right[5], a_right[26]);
+ b_right[6] = vaddq_s32(a_right[6], a_right[25]);
+ b_right[7] = vaddq_s32(a_right[7], a_right[24]);
+ b_right[8] = vaddq_s32(a_right[8], a_right[23]);
+ b_right[9] = vaddq_s32(a_right[9], a_right[22]);
+ b_right[10] = vaddq_s32(a_right[10], a_right[21]);
+ b_right[11] = vaddq_s32(a_right[11], a_right[20]);
+ b_right[12] = vaddq_s32(a_right[12], a_right[19]);
+ b_right[13] = vaddq_s32(a_right[13], a_right[18]);
+ b_right[14] = vaddq_s32(a_right[14], a_right[17]);
+ b_right[15] = vaddq_s32(a_right[15], a_right[16]);
+
+ b_left[16] = vsubq_s32(a_left[15], a_left[16]);
+ b_left[17] = vsubq_s32(a_left[14], a_left[17]);
+ b_left[18] = vsubq_s32(a_left[13], a_left[18]);
+ b_left[19] = vsubq_s32(a_left[12], a_left[19]);
+ b_left[20] = vsubq_s32(a_left[11], a_left[20]);
+ b_left[21] = vsubq_s32(a_left[10], a_left[21]);
+ b_left[22] = vsubq_s32(a_left[9], a_left[22]);
+ b_left[23] = vsubq_s32(a_left[8], a_left[23]);
+ b_left[24] = vsubq_s32(a_left[7], a_left[24]);
+ b_left[25] = vsubq_s32(a_left[6], a_left[25]);
+ b_left[26] = vsubq_s32(a_left[5], a_left[26]);
+ b_left[27] = vsubq_s32(a_left[4], a_left[27]);
+ b_left[28] = vsubq_s32(a_left[3], a_left[28]);
+ b_left[29] = vsubq_s32(a_left[2], a_left[29]);
+ b_left[30] = vsubq_s32(a_left[1], a_left[30]);
+ b_left[31] = vsubq_s32(a_left[0], a_left[31]);
+
+ b_right[16] = vsubq_s32(a_right[15], a_right[16]);
+ b_right[17] = vsubq_s32(a_right[14], a_right[17]);
+ b_right[18] = vsubq_s32(a_right[13], a_right[18]);
+ b_right[19] = vsubq_s32(a_right[12], a_right[19]);
+ b_right[20] = vsubq_s32(a_right[11], a_right[20]);
+ b_right[21] = vsubq_s32(a_right[10], a_right[21]);
+ b_right[22] = vsubq_s32(a_right[9], a_right[22]);
+ b_right[23] = vsubq_s32(a_right[8], a_right[23]);
+ b_right[24] = vsubq_s32(a_right[7], a_right[24]);
+ b_right[25] = vsubq_s32(a_right[6], a_right[25]);
+ b_right[26] = vsubq_s32(a_right[5], a_right[26]);
+ b_right[27] = vsubq_s32(a_right[4], a_right[27]);
+ b_right[28] = vsubq_s32(a_right[3], a_right[28]);
+ b_right[29] = vsubq_s32(a_right[2], a_right[29]);
+ b_right[30] = vsubq_s32(a_right[1], a_right[30]);
+ b_right[31] = vsubq_s32(a_right[0], a_right[31]);
+}
+
+static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/,
+ int32x4_t *right /* [32] */) {
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+
+ left[0] = add_round_shift_s32(left[0]);
+ left[1] = add_round_shift_s32(left[1]);
+ left[2] = add_round_shift_s32(left[2]);
+ left[3] = add_round_shift_s32(left[3]);
+ left[4] = add_round_shift_s32(left[4]);
+ left[5] = add_round_shift_s32(left[5]);
+ left[6] = add_round_shift_s32(left[6]);
+ left[7] = add_round_shift_s32(left[7]);
+ left[8] = add_round_shift_s32(left[8]);
+ left[9] = add_round_shift_s32(left[9]);
+ left[10] = add_round_shift_s32(left[10]);
+ left[11] = add_round_shift_s32(left[11]);
+ left[12] = add_round_shift_s32(left[12]);
+ left[13] = add_round_shift_s32(left[13]);
+ left[14] = add_round_shift_s32(left[14]);
+ left[15] = add_round_shift_s32(left[15]);
+ left[16] = add_round_shift_s32(left[16]);
+ left[17] = add_round_shift_s32(left[17]);
+ left[18] = add_round_shift_s32(left[18]);
+ left[19] = add_round_shift_s32(left[19]);
+ left[20] = add_round_shift_s32(left[20]);
+ left[21] = add_round_shift_s32(left[21]);
+ left[22] = add_round_shift_s32(left[22]);
+ left[23] = add_round_shift_s32(left[23]);
+ left[24] = add_round_shift_s32(left[24]);
+ left[25] = add_round_shift_s32(left[25]);
+ left[26] = add_round_shift_s32(left[26]);
+ left[27] = add_round_shift_s32(left[27]);
+ left[28] = add_round_shift_s32(left[28]);
+ left[29] = add_round_shift_s32(left[29]);
+ left[30] = add_round_shift_s32(left[30]);
+ left[31] = add_round_shift_s32(left[31]);
+
+ right[0] = add_round_shift_s32(right[0]);
+ right[1] = add_round_shift_s32(right[1]);
+ right[2] = add_round_shift_s32(right[2]);
+ right[3] = add_round_shift_s32(right[3]);
+ right[4] = add_round_shift_s32(right[4]);
+ right[5] = add_round_shift_s32(right[5]);
+ right[6] = add_round_shift_s32(right[6]);
+ right[7] = add_round_shift_s32(right[7]);
+ right[8] = add_round_shift_s32(right[8]);
+ right[9] = add_round_shift_s32(right[9]);
+ right[10] = add_round_shift_s32(right[10]);
+ right[11] = add_round_shift_s32(right[11]);
+ right[12] = add_round_shift_s32(right[12]);
+ right[13] = add_round_shift_s32(right[13]);
+ right[14] = add_round_shift_s32(right[14]);
+ right[15] = add_round_shift_s32(right[15]);
+ right[16] = add_round_shift_s32(right[16]);
+ right[17] = add_round_shift_s32(right[17]);
+ right[18] = add_round_shift_s32(right[18]);
+ right[19] = add_round_shift_s32(right[19]);
+ right[20] = add_round_shift_s32(right[20]);
+ right[21] = add_round_shift_s32(right[21]);
+ right[22] = add_round_shift_s32(right[22]);
+ right[23] = add_round_shift_s32(right[23]);
+ right[24] = add_round_shift_s32(right[24]);
+ right[25] = add_round_shift_s32(right[25]);
+ right[26] = add_round_shift_s32(right[26]);
+ right[27] = add_round_shift_s32(right[27]);
+ right[28] = add_round_shift_s32(right[28]);
+ right[29] = add_round_shift_s32(right[29]);
+ right[30] = add_round_shift_s32(right[30]);
+ right[31] = add_round_shift_s32(right[31]);
+}
+
+static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/,
+ int32x4_t *right /* [32] */) {
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+
+ left[0] = sub_round_shift_s32(left[0]);
+ left[1] = sub_round_shift_s32(left[1]);
+ left[2] = sub_round_shift_s32(left[2]);
+ left[3] = sub_round_shift_s32(left[3]);
+ left[4] = sub_round_shift_s32(left[4]);
+ left[5] = sub_round_shift_s32(left[5]);
+ left[6] = sub_round_shift_s32(left[6]);
+ left[7] = sub_round_shift_s32(left[7]);
+ left[8] = sub_round_shift_s32(left[8]);
+ left[9] = sub_round_shift_s32(left[9]);
+ left[10] = sub_round_shift_s32(left[10]);
+ left[11] = sub_round_shift_s32(left[11]);
+ left[12] = sub_round_shift_s32(left[12]);
+ left[13] = sub_round_shift_s32(left[13]);
+ left[14] = sub_round_shift_s32(left[14]);
+ left[15] = sub_round_shift_s32(left[15]);
+ left[16] = sub_round_shift_s32(left[16]);
+ left[17] = sub_round_shift_s32(left[17]);
+ left[18] = sub_round_shift_s32(left[18]);
+ left[19] = sub_round_shift_s32(left[19]);
+ left[20] = sub_round_shift_s32(left[20]);
+ left[21] = sub_round_shift_s32(left[21]);
+ left[22] = sub_round_shift_s32(left[22]);
+ left[23] = sub_round_shift_s32(left[23]);
+ left[24] = sub_round_shift_s32(left[24]);
+ left[25] = sub_round_shift_s32(left[25]);
+ left[26] = sub_round_shift_s32(left[26]);
+ left[27] = sub_round_shift_s32(left[27]);
+ left[28] = sub_round_shift_s32(left[28]);
+ left[29] = sub_round_shift_s32(left[29]);
+ left[30] = sub_round_shift_s32(left[30]);
+ left[31] = sub_round_shift_s32(left[31]);
+
+ right[0] = sub_round_shift_s32(right[0]);
+ right[1] = sub_round_shift_s32(right[1]);
+ right[2] = sub_round_shift_s32(right[2]);
+ right[3] = sub_round_shift_s32(right[3]);
+ right[4] = sub_round_shift_s32(right[4]);
+ right[5] = sub_round_shift_s32(right[5]);
+ right[6] = sub_round_shift_s32(right[6]);
+ right[7] = sub_round_shift_s32(right[7]);
+ right[8] = sub_round_shift_s32(right[8]);
+ right[9] = sub_round_shift_s32(right[9]);
+ right[10] = sub_round_shift_s32(right[10]);
+ right[11] = sub_round_shift_s32(right[11]);
+ right[12] = sub_round_shift_s32(right[12]);
+ right[13] = sub_round_shift_s32(right[13]);
+ right[14] = sub_round_shift_s32(right[14]);
+ right[15] = sub_round_shift_s32(right[15]);
+ right[16] = sub_round_shift_s32(right[16]);
+ right[17] = sub_round_shift_s32(right[17]);
+ right[18] = sub_round_shift_s32(right[18]);
+ right[19] = sub_round_shift_s32(right[19]);
+ right[20] = sub_round_shift_s32(right[20]);
+ right[21] = sub_round_shift_s32(right[21]);
+ right[22] = sub_round_shift_s32(right[22]);
+ right[23] = sub_round_shift_s32(right[23]);
+ right[24] = sub_round_shift_s32(right[24]);
+ right[25] = sub_round_shift_s32(right[25]);
+ right[26] = sub_round_shift_s32(right[26]);
+ right[27] = sub_round_shift_s32(right[27]);
+ right[28] = sub_round_shift_s32(right[28]);
+ right[29] = sub_round_shift_s32(right[29]);
+ right[30] = sub_round_shift_s32(right[30]);
+ right[31] = sub_round_shift_s32(right[31]);
+}
+
+static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/,
+ int32x4_t *right /*32*/) {
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // Mini cross. X the first 16 values and the middle 8 of the second half.
+ al[0] = vaddq_s32(left[0], left[15]);
+ ar[0] = vaddq_s32(right[0], right[15]);
+ al[1] = vaddq_s32(left[1], left[14]);
+ ar[1] = vaddq_s32(right[1], right[14]);
+ al[2] = vaddq_s32(left[2], left[13]);
+ ar[2] = vaddq_s32(right[2], right[13]);
+ al[3] = vaddq_s32(left[3], left[12]);
+ ar[3] = vaddq_s32(right[3], right[12]);
+ al[4] = vaddq_s32(left[4], left[11]);
+ ar[4] = vaddq_s32(right[4], right[11]);
+ al[5] = vaddq_s32(left[5], left[10]);
+ ar[5] = vaddq_s32(right[5], right[10]);
+ al[6] = vaddq_s32(left[6], left[9]);
+ ar[6] = vaddq_s32(right[6], right[9]);
+ al[7] = vaddq_s32(left[7], left[8]);
+ ar[7] = vaddq_s32(right[7], right[8]);
+
+ al[8] = vsubq_s32(left[7], left[8]);
+ ar[8] = vsubq_s32(right[7], right[8]);
+ al[9] = vsubq_s32(left[6], left[9]);
+ ar[9] = vsubq_s32(right[6], right[9]);
+ al[10] = vsubq_s32(left[5], left[10]);
+ ar[10] = vsubq_s32(right[5], right[10]);
+ al[11] = vsubq_s32(left[4], left[11]);
+ ar[11] = vsubq_s32(right[4], right[11]);
+ al[12] = vsubq_s32(left[3], left[12]);
+ ar[12] = vsubq_s32(right[3], right[12]);
+ al[13] = vsubq_s32(left[2], left[13]);
+ ar[13] = vsubq_s32(right[2], right[13]);
+ al[14] = vsubq_s32(left[1], left[14]);
+ ar[14] = vsubq_s32(right[1], right[14]);
+ al[15] = vsubq_s32(left[0], left[15]);
+ ar[15] = vsubq_s32(right[0], right[15]);
+
+ al[16] = left[16];
+ ar[16] = right[16];
+ al[17] = left[17];
+ ar[17] = right[17];
+ al[18] = left[18];
+ ar[18] = right[18];
+ al[19] = left[19];
+ ar[19] = right[19];
+
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+ cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+ cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+ cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+ cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+ al[28] = left[28];
+ ar[28] = right[28];
+ al[29] = left[29];
+ ar[29] = right[29];
+ al[30] = left[30];
+ ar[30] = right[30];
+ al[31] = left[31];
+ ar[31] = right[31];
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+ &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+ &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(left[16], al[23]);
+ br[16] = vaddq_s32(right[16], ar[23]);
+ bl[17] = vaddq_s32(left[17], al[22]);
+ br[17] = vaddq_s32(right[17], ar[22]);
+ bl[18] = vaddq_s32(left[18], al[21]);
+ br[18] = vaddq_s32(right[18], ar[21]);
+ bl[19] = vaddq_s32(left[19], al[20]);
+ br[19] = vaddq_s32(right[19], ar[20]);
+
+ bl[20] = vsubq_s32(left[19], al[20]);
+ br[20] = vsubq_s32(right[19], ar[20]);
+ bl[21] = vsubq_s32(left[18], al[21]);
+ br[21] = vsubq_s32(right[18], ar[21]);
+ bl[22] = vsubq_s32(left[17], al[22]);
+ br[22] = vsubq_s32(right[17], ar[22]);
+ bl[23] = vsubq_s32(left[16], al[23]);
+ br[23] = vsubq_s32(right[16], ar[23]);
+
+ bl[24] = vsubq_s32(left[31], al[24]);
+ br[24] = vsubq_s32(right[31], ar[24]);
+ bl[25] = vsubq_s32(left[30], al[25]);
+ br[25] = vsubq_s32(right[30], ar[25]);
+ bl[26] = vsubq_s32(left[29], al[26]);
+ br[26] = vsubq_s32(right[29], ar[26]);
+ bl[27] = vsubq_s32(left[28], al[27]);
+ br[27] = vsubq_s32(right[28], ar[27]);
+
+ bl[28] = vaddq_s32(left[28], al[27]);
+ br[28] = vaddq_s32(right[28], ar[27]);
+ bl[29] = vaddq_s32(left[29], al[26]);
+ br[29] = vaddq_s32(right[29], ar[26]);
+ bl[30] = vaddq_s32(left[30], al[25]);
+ br[30] = vaddq_s32(right[30], ar[25]);
+ bl[31] = vaddq_s32(left[31], al[24]);
+ br[31] = vaddq_s32(right[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+ &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+ cospi_24_64, &al[29], &ar[29], &al[18],
+ &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+ cospi_24_64, &al[28], &ar[28], &al[19],
+ &ar[19]);
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+ cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+ &al[20], &ar[20]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+ &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+ cospi_24_64, &bl[2], &br[2], &bl[3],
+ &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+ cospi_24_64, &bl[14], &br[14], &bl[9],
+ &br[9]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+ &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+ cospi_28_64, &al[4], &ar[4], &al[7],
+ &ar[7]);
+ butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+ cospi_12_64, &al[5], &ar[5], &al[6],
+ &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+ cospi_28_64, &al[30], &ar[30], &al[17],
+ &ar[17]);
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+ cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+ &al[18], &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_20_64, cospi_12_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_12_64, -cospi_20_64, &al[25],
+ &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+ cospi_30_64, &bl[8], &br[8], &bl[15],
+ &br[15]);
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+ cospi_14_64, &bl[9], &br[9], &bl[14],
+ &br[14]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_10_64, cospi_22_64, &bl[10], &br[10],
+ &bl[13], &br[13]);
+ butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+ cospi_26_64, cospi_6_64, &bl[11], &br[11],
+ &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
+
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+ cospi_31_64, &al[1], &ar[1], &al[31],
+ &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+ cospi_17_64, cospi_15_64, &al[17], &ar[17],
+ &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+ cospi_23_64, &al[9], &ar[9], &al[23],
+ &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+ cospi_25_64, cospi_7_64, &al[25], &ar[25],
+ &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+ cospi_27_64, &al[5], &ar[5], &al[27],
+ &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_21_64, cospi_11_64, &al[21], &ar[21],
+ &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_13_64, cospi_19_64, &al[13], &ar[13],
+ &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+ cospi_29_64, cospi_3_64, &al[29], &ar[29],
+ &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
+
+static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/,
+ int32x4_t *right /*32*/) {
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // Mini cross. X the first 16 values and the middle 8 of the second half.
+ al[0] = vaddq_s32(left[0], left[15]);
+ ar[0] = vaddq_s32(right[0], right[15]);
+ al[1] = vaddq_s32(left[1], left[14]);
+ ar[1] = vaddq_s32(right[1], right[14]);
+ al[2] = vaddq_s32(left[2], left[13]);
+ ar[2] = vaddq_s32(right[2], right[13]);
+ al[3] = vaddq_s32(left[3], left[12]);
+ ar[3] = vaddq_s32(right[3], right[12]);
+ al[4] = vaddq_s32(left[4], left[11]);
+ ar[4] = vaddq_s32(right[4], right[11]);
+ al[5] = vaddq_s32(left[5], left[10]);
+ ar[5] = vaddq_s32(right[5], right[10]);
+ al[6] = vaddq_s32(left[6], left[9]);
+ ar[6] = vaddq_s32(right[6], right[9]);
+ al[7] = vaddq_s32(left[7], left[8]);
+ ar[7] = vaddq_s32(right[7], right[8]);
+
+ al[8] = vsubq_s32(left[7], left[8]);
+ ar[8] = vsubq_s32(right[7], right[8]);
+ al[9] = vsubq_s32(left[6], left[9]);
+ ar[9] = vsubq_s32(right[6], right[9]);
+ al[10] = vsubq_s32(left[5], left[10]);
+ ar[10] = vsubq_s32(right[5], right[10]);
+ al[11] = vsubq_s32(left[4], left[11]);
+ ar[11] = vsubq_s32(right[4], right[11]);
+ al[12] = vsubq_s32(left[3], left[12]);
+ ar[12] = vsubq_s32(right[3], right[12]);
+ al[13] = vsubq_s32(left[2], left[13]);
+ ar[13] = vsubq_s32(right[2], right[13]);
+ al[14] = vsubq_s32(left[1], left[14]);
+ ar[14] = vsubq_s32(right[1], right[14]);
+ al[15] = vsubq_s32(left[0], left[15]);
+ ar[15] = vsubq_s32(right[0], right[15]);
+
+ al[16] = left[16];
+ ar[16] = right[16];
+ al[17] = left[17];
+ ar[17] = right[17];
+ al[18] = left[18];
+ ar[18] = right[18];
+ al[19] = left[19];
+ ar[19] = right[19];
+
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+ cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+ cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+ cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+ cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+ al[28] = left[28];
+ ar[28] = right[28];
+ al[29] = left[29];
+ ar[29] = right[29];
+ al[30] = left[30];
+ ar[30] = right[30];
+ al[31] = left[31];
+ ar[31] = right[31];
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+ &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+ &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(left[16], al[23]);
+ br[16] = vaddq_s32(right[16], ar[23]);
+ bl[17] = vaddq_s32(left[17], al[22]);
+ br[17] = vaddq_s32(right[17], ar[22]);
+ bl[18] = vaddq_s32(left[18], al[21]);
+ br[18] = vaddq_s32(right[18], ar[21]);
+ bl[19] = vaddq_s32(left[19], al[20]);
+ br[19] = vaddq_s32(right[19], ar[20]);
+
+ bl[20] = vsubq_s32(left[19], al[20]);
+ br[20] = vsubq_s32(right[19], ar[20]);
+ bl[21] = vsubq_s32(left[18], al[21]);
+ br[21] = vsubq_s32(right[18], ar[21]);
+ bl[22] = vsubq_s32(left[17], al[22]);
+ br[22] = vsubq_s32(right[17], ar[22]);
+ bl[23] = vsubq_s32(left[16], al[23]);
+ br[23] = vsubq_s32(right[16], ar[23]);
+
+ bl[24] = vsubq_s32(left[31], al[24]);
+ br[24] = vsubq_s32(right[31], ar[24]);
+ bl[25] = vsubq_s32(left[30], al[25]);
+ br[25] = vsubq_s32(right[30], ar[25]);
+ bl[26] = vsubq_s32(left[29], al[26]);
+ br[26] = vsubq_s32(right[29], ar[26]);
+ bl[27] = vsubq_s32(left[28], al[27]);
+ br[27] = vsubq_s32(right[28], ar[27]);
+
+ bl[28] = vaddq_s32(left[28], al[27]);
+ br[28] = vaddq_s32(right[28], ar[27]);
+ bl[29] = vaddq_s32(left[29], al[26]);
+ br[29] = vaddq_s32(right[29], ar[26]);
+ bl[30] = vaddq_s32(left[30], al[25]);
+ br[30] = vaddq_s32(right[30], ar[25]);
+ bl[31] = vaddq_s32(left[31], al[24]);
+ br[31] = vaddq_s32(right[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+ &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+ cospi_24_64, &al[29], &ar[29], &al[18],
+ &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+ cospi_24_64, &al[28], &ar[28], &al[19],
+ &ar[19]);
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+ cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+ &al[20], &ar[20]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+ &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+ cospi_24_64, &bl[2], &br[2], &bl[3],
+ &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+ cospi_24_64, &bl[14], &br[14], &bl[9],
+ &br[9]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+ &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+ cospi_28_64, &al[4], &ar[4], &al[7],
+ &ar[7]);
+ butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+ cospi_12_64, &al[5], &ar[5], &al[6],
+ &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+ cospi_28_64, &al[30], &ar[30], &al[17],
+ &ar[17]);
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+ cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+ &al[18], &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_20_64, cospi_12_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_12_64, -cospi_20_64, &al[25],
+ &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+ cospi_30_64, &bl[8], &br[8], &bl[15],
+ &br[15]);
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+ cospi_14_64, &bl[9], &br[9], &bl[14],
+ &br[14]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_10_64, cospi_22_64, &bl[10], &br[10],
+ &bl[13], &br[13]);
+ butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+ cospi_26_64, cospi_6_64, &bl[11], &br[11],
+ &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
+
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+ cospi_31_64, &al[1], &ar[1], &al[31],
+ &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+ cospi_17_64, cospi_15_64, &al[17], &ar[17],
+ &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+ cospi_23_64, &al[9], &ar[9], &al[23],
+ &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+ cospi_25_64, cospi_7_64, &al[25], &ar[25],
+ &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+ cospi_27_64, &al[5], &ar[5], &al[27],
+ &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_21_64, cospi_11_64, &al[21], &ar[21],
+ &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_13_64, cospi_19_64, &al[13], &ar[13],
+ &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+ cospi_29_64, cospi_3_64, &al[29], &ar[29],
+ &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
+
+static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/,
+ int32x4_t *right /*32*/) {
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // For the "rd" version, all the values are rounded down after stage 2 to keep
+ // the values in 16 bits.
+ al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15]));
+ ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15]));
+ al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14]));
+ ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14]));
+ al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13]));
+ ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13]));
+ al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12]));
+ ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12]));
+ al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11]));
+ ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11]));
+ al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10]));
+ ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10]));
+ al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9]));
+ ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9]));
+ al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8]));
+ ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8]));
+
+ al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8]));
+ ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8]));
+ al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9]));
+ ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9]));
+ al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10]));
+ ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10]));
+ al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11]));
+ ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11]));
+ al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12]));
+ ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12]));
+ al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13]));
+ ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13]));
+ al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14]));
+ ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14]));
+ al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15]));
+ ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15]));
+
+ al[16] = add_round_shift_s32(left[16]);
+ ar[16] = add_round_shift_s32(right[16]);
+ al[17] = add_round_shift_s32(left[17]);
+ ar[17] = add_round_shift_s32(right[17]);
+ al[18] = add_round_shift_s32(left[18]);
+ ar[18] = add_round_shift_s32(right[18]);
+ al[19] = add_round_shift_s32(left[19]);
+ ar[19] = add_round_shift_s32(right[19]);
+
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+ cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+ cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+ cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+ cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+ al[20] = add_round_shift_s32(al[20]);
+ ar[20] = add_round_shift_s32(ar[20]);
+ al[21] = add_round_shift_s32(al[21]);
+ ar[21] = add_round_shift_s32(ar[21]);
+ al[22] = add_round_shift_s32(al[22]);
+ ar[22] = add_round_shift_s32(ar[22]);
+ al[23] = add_round_shift_s32(al[23]);
+ ar[23] = add_round_shift_s32(ar[23]);
+ al[24] = add_round_shift_s32(al[24]);
+ ar[24] = add_round_shift_s32(ar[24]);
+ al[25] = add_round_shift_s32(al[25]);
+ ar[25] = add_round_shift_s32(ar[25]);
+ al[26] = add_round_shift_s32(al[26]);
+ ar[26] = add_round_shift_s32(ar[26]);
+ al[27] = add_round_shift_s32(al[27]);
+ ar[27] = add_round_shift_s32(ar[27]);
+
+ al[28] = add_round_shift_s32(left[28]);
+ ar[28] = add_round_shift_s32(right[28]);
+ al[29] = add_round_shift_s32(left[29]);
+ ar[29] = add_round_shift_s32(right[29]);
+ al[30] = add_round_shift_s32(left[30]);
+ ar[30] = add_round_shift_s32(right[30]);
+ al[31] = add_round_shift_s32(left[31]);
+ ar[31] = add_round_shift_s32(right[31]);
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+ &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+ &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[16], al[23]);
+ br[16] = vaddq_s32(ar[16], ar[23]);
+ bl[17] = vaddq_s32(al[17], al[22]);
+ br[17] = vaddq_s32(ar[17], ar[22]);
+ bl[18] = vaddq_s32(al[18], al[21]);
+ br[18] = vaddq_s32(ar[18], ar[21]);
+ bl[19] = vaddq_s32(al[19], al[20]);
+ br[19] = vaddq_s32(ar[19], ar[20]);
+
+ bl[20] = vsubq_s32(al[19], al[20]);
+ br[20] = vsubq_s32(ar[19], ar[20]);
+ bl[21] = vsubq_s32(al[18], al[21]);
+ br[21] = vsubq_s32(ar[18], ar[21]);
+ bl[22] = vsubq_s32(al[17], al[22]);
+ br[22] = vsubq_s32(ar[17], ar[22]);
+ bl[23] = vsubq_s32(al[16], al[23]);
+ br[23] = vsubq_s32(ar[16], ar[23]);
+
+ bl[24] = vsubq_s32(al[31], al[24]);
+ br[24] = vsubq_s32(ar[31], ar[24]);
+ bl[25] = vsubq_s32(al[30], al[25]);
+ br[25] = vsubq_s32(ar[30], ar[25]);
+ bl[26] = vsubq_s32(al[29], al[26]);
+ br[26] = vsubq_s32(ar[29], ar[26]);
+ bl[27] = vsubq_s32(al[28], al[27]);
+ br[27] = vsubq_s32(ar[28], ar[27]);
+
+ bl[28] = vaddq_s32(al[28], al[27]);
+ br[28] = vaddq_s32(ar[28], ar[27]);
+ bl[29] = vaddq_s32(al[29], al[26]);
+ br[29] = vaddq_s32(ar[29], ar[26]);
+ bl[30] = vaddq_s32(al[30], al[25]);
+ br[30] = vaddq_s32(ar[30], ar[25]);
+ bl[31] = vaddq_s32(al[31], al[24]);
+ br[31] = vaddq_s32(ar[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+ &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64,
+ cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]);
+ butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64,
+ cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]);
+ butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64,
+ -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64,
+ -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+ &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64,
+ &bl[2], &br[2], &bl[3], &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64,
+ &bl[14], &br[14], &bl[9], &br[9]);
+ butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64,
+ -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64,
+ &al[4], &ar[4], &al[7], &ar[7]);
+ butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64,
+ &al[5], &ar[5], &al[6], &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64,
+ cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]);
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64,
+ -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]);
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64,
+ cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64,
+ -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64,
+ &bl[8], &br[8], &bl[15], &br[15]);
+ butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64,
+ cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]);
+ butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64,
+ cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]);
+ butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64,
+ cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64,
+ cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64,
+ cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64,
+ cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64,
+ cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64,
+ cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64,
+ cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64,
+ cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64,
+ cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
diff --git a/vpx_dsp/arm/fdct_neon.c b/vpx_dsp/arm/fdct4x4_neon.c
index 2827791f1..3b9196fae 100644
--- a/vpx_dsp/arm/fdct_neon.c
+++ b/vpx_dsp/arm/fdct4x4_neon.c
@@ -18,10 +18,10 @@
#include "vpx_dsp/arm/fdct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
int stride) {
- int i;
// input[M * stride] * 16
int16x4_t in[4];
in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
@@ -34,9 +34,8 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
in[0] = vadd_s16(in[0], one);
}
- for (i = 0; i < 2; ++i) {
- vpx_fdct4x4_pass1_neon(in);
- }
+ vpx_fdct4x4_pass1_neon(in);
+ vpx_fdct4x4_pass2_neon(in);
{
// Not quite a rounding shift. Only add 1 despite shifting by 2.
const int16x8_t one = vdupq_n_s16(1);
@@ -48,3 +47,39 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
store_s16q_to_tran_low(final_output + 1 * 8, out_23);
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ static const int32x4_t const_1000 = { 1, 0, 0, 0 };
+ const int32x4_t const_one = vdupq_n_s32(1);
+
+ // input[M * stride] * 16
+ int32x4_t in[4];
+ in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4);
+ in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4);
+ in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4);
+ in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4);
+
+ // If the very first value != 0, then add 1.
+ if (input[0] != 0) {
+ in[0] = vaddq_s32(in[0], const_1000);
+ }
+
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ {
+ // Not quite a rounding shift. Only add 1 despite shifting by 2.
+ in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2);
+ in[1] = vshrq_n_s32(vaddq_s32(in[1], const_one), 2);
+ in[2] = vshrq_n_s32(vaddq_s32(in[2], const_one), 2);
+ in[3] = vshrq_n_s32(vaddq_s32(in[3], const_one), 2);
+
+ vst1q_s32(final_output, in[0]);
+ vst1q_s32(final_output + 4, in[1]);
+ vst1q_s32(final_output + 8, in[2]);
+ vst1q_s32(final_output + 12, in[3]);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/arm/fdct4x4_neon.h b/vpx_dsp/arm/fdct4x4_neon.h
new file mode 100644
index 000000000..de3db9774
--- /dev/null
+++ b/vpx_dsp/arm/fdct4x4_neon.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
+ int16x4_t out[4];
+
+ const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+ const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+ butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+ transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) {
+ int16x4_t out[4];
+
+ const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+ const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+ butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0],
+ &out[2]);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+ transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
+ int32x4_t out[4];
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int32x4_t s_0 = vaddq_s32(in[0], in[3]);
+ const int32x4_t s_1 = vaddq_s32(in[1], in[2]);
+ const int32x4_t s_2 = vsubq_s32(in[1], in[2]);
+ const int32x4_t s_3 = vsubq_s32(in[0], in[3]);
+
+ butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+ // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64,
+ &out[1], &out[3]);
+
+ transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
diff --git a/vpx_dsp/arm/fdct8x8_neon.c b/vpx_dsp/arm/fdct8x8_neon.c
new file mode 100644
index 000000000..75ee6f223
--- /dev/null
+++ b/vpx_dsp/arm/fdct8x8_neon.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
+
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ // stage 1
+ int16x8_t in[8];
+ in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+ in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+ in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+ in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+ in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+ in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+ in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+ in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+
+ vpx_fdct8x8_pass1_neon(in);
+ vpx_fdct8x8_pass2_neon(in);
+ {
+ // from vpx_dct_sse2.c
+ // Post-condition (division by two)
+ // division of two 16 bits signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
+ const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15);
+ const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15);
+ const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15);
+ const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15);
+ const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15);
+ const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15);
+ const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15);
+ const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15);
+ in[0] = vhsubq_s16(in[0], sign_in0);
+ in[1] = vhsubq_s16(in[1], sign_in1);
+ in[2] = vhsubq_s16(in[2], sign_in2);
+ in[3] = vhsubq_s16(in[3], sign_in3);
+ in[4] = vhsubq_s16(in[4], sign_in4);
+ in[5] = vhsubq_s16(in[5], sign_in5);
+ in[6] = vhsubq_s16(in[6], sign_in6);
+ in[7] = vhsubq_s16(in[7], sign_in7);
+ // store results
+ store_s16q_to_tran_low(final_output + 0 * 8, in[0]);
+ store_s16q_to_tran_low(final_output + 1 * 8, in[1]);
+ store_s16q_to_tran_low(final_output + 2 * 8, in[2]);
+ store_s16q_to_tran_low(final_output + 3 * 8, in[3]);
+ store_s16q_to_tran_low(final_output + 4 * 8, in[4]);
+ store_s16q_to_tran_low(final_output + 5 * 8, in[5]);
+ store_s16q_to_tran_low(final_output + 6 * 8, in[6]);
+ store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ // input[M * stride] * 16
+ int32x4_t left[8], right[8];
+ int16x8_t in[8];
+ in[0] = vld1q_s16(input + 0 * stride);
+ in[1] = vld1q_s16(input + 1 * stride);
+ in[2] = vld1q_s16(input + 2 * stride);
+ in[3] = vld1q_s16(input + 3 * stride);
+ in[4] = vld1q_s16(input + 4 * stride);
+ in[5] = vld1q_s16(input + 5 * stride);
+ in[6] = vld1q_s16(input + 6 * stride);
+ in[7] = vld1q_s16(input + 7 * stride);
+
+ left[0] = vshll_n_s16(vget_low_s16(in[0]), 2);
+ left[1] = vshll_n_s16(vget_low_s16(in[1]), 2);
+ left[2] = vshll_n_s16(vget_low_s16(in[2]), 2);
+ left[3] = vshll_n_s16(vget_low_s16(in[3]), 2);
+ left[4] = vshll_n_s16(vget_low_s16(in[4]), 2);
+ left[5] = vshll_n_s16(vget_low_s16(in[5]), 2);
+ left[6] = vshll_n_s16(vget_low_s16(in[6]), 2);
+ left[7] = vshll_n_s16(vget_low_s16(in[7]), 2);
+ right[0] = vshll_n_s16(vget_high_s16(in[0]), 2);
+ right[1] = vshll_n_s16(vget_high_s16(in[1]), 2);
+ right[2] = vshll_n_s16(vget_high_s16(in[2]), 2);
+ right[3] = vshll_n_s16(vget_high_s16(in[3]), 2);
+ right[4] = vshll_n_s16(vget_high_s16(in[4]), 2);
+ right[5] = vshll_n_s16(vget_high_s16(in[5]), 2);
+ right[6] = vshll_n_s16(vget_high_s16(in[6]), 2);
+ right[7] = vshll_n_s16(vget_high_s16(in[7]), 2);
+
+ vpx_highbd_fdct8x8_pass1_neon(left, right);
+ vpx_highbd_fdct8x8_pass2_neon(left, right);
+ {
+ left[0] = add_round_shift_half_s32(left[0]);
+ left[1] = add_round_shift_half_s32(left[1]);
+ left[2] = add_round_shift_half_s32(left[2]);
+ left[3] = add_round_shift_half_s32(left[3]);
+ left[4] = add_round_shift_half_s32(left[4]);
+ left[5] = add_round_shift_half_s32(left[5]);
+ left[6] = add_round_shift_half_s32(left[6]);
+ left[7] = add_round_shift_half_s32(left[7]);
+ right[0] = add_round_shift_half_s32(right[0]);
+ right[1] = add_round_shift_half_s32(right[1]);
+ right[2] = add_round_shift_half_s32(right[2]);
+ right[3] = add_round_shift_half_s32(right[3]);
+ right[4] = add_round_shift_half_s32(right[4]);
+ right[5] = add_round_shift_half_s32(right[5]);
+ right[6] = add_round_shift_half_s32(right[6]);
+ right[7] = add_round_shift_half_s32(right[7]);
+
+ // store results
+ vst1q_s32(final_output, left[0]);
+ vst1q_s32(final_output + 4, right[0]);
+ vst1q_s32(final_output + 8, left[1]);
+ vst1q_s32(final_output + 12, right[1]);
+ vst1q_s32(final_output + 16, left[2]);
+ vst1q_s32(final_output + 20, right[2]);
+ vst1q_s32(final_output + 24, left[3]);
+ vst1q_s32(final_output + 28, right[3]);
+ vst1q_s32(final_output + 32, left[4]);
+ vst1q_s32(final_output + 36, right[4]);
+ vst1q_s32(final_output + 40, left[5]);
+ vst1q_s32(final_output + 44, right[5]);
+ vst1q_s32(final_output + 48, left[6]);
+ vst1q_s32(final_output + 52, right[6]);
+ vst1q_s32(final_output + 56, left[7]);
+ vst1q_s32(final_output + 60, right[7]);
+ }
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/arm/fdct8x8_neon.h b/vpx_dsp/arm/fdct8x8_neon.h
new file mode 100644
index 000000000..d8fa60044
--- /dev/null
+++ b/vpx_dsp/arm/fdct8x8_neon.h
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t s[8], x[4], t[2];
+
+ s[0] = vaddq_s16(in[0], in[7]);
+ s[1] = vaddq_s16(in[1], in[6]);
+ s[2] = vaddq_s16(in[2], in[5]);
+ s[3] = vaddq_s16(in[3], in[4]);
+ s[4] = vsubq_s16(in[3], in[4]);
+ s[5] = vsubq_s16(in[2], in[5]);
+ s[6] = vsubq_s16(in[1], in[6]);
+ s[7] = vsubq_s16(in[0], in[7]);
+ // fdct4(step, step);
+ x[0] = vaddq_s16(s[0], s[3]);
+ x[1] = vaddq_s16(s[1], s[2]);
+ x[2] = vsubq_s16(s[1], s[2]);
+ x[3] = vsubq_s16(s[0], s[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]);
+
+ // Stage 3
+ x[0] = vaddq_s16(s[4], t[0]);
+ x[1] = vsubq_s16(s[4], t[0]);
+ x[2] = vsubq_s16(s[7], t[1]);
+ x[3] = vaddq_s16(s[7], t[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t s[8], x[4], t[2];
+
+ s[0] = vaddq_s16(in[0], in[7]);
+ s[1] = vaddq_s16(in[1], in[6]);
+ s[2] = vaddq_s16(in[2], in[5]);
+ s[3] = vaddq_s16(in[3], in[4]);
+ s[4] = vsubq_s16(in[3], in[4]);
+ s[5] = vsubq_s16(in[2], in[5]);
+ s[6] = vsubq_s16(in[1], in[6]);
+ s[7] = vsubq_s16(in[0], in[7]);
+ // fdct4(step, step);
+ x[0] = vaddq_s16(s[0], s[3]);
+ x[1] = vaddq_s16(s[1], s[2]);
+ x[2] = vsubq_s16(s[1], s[2]);
+ x[3] = vsubq_s16(s[0], s[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+ &out[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1],
+ &t[0]);
+
+ // Stage 3
+ x[0] = vaddq_s16(s[4], t[0]);
+ x[1] = vsubq_s16(s[4], t[0]);
+ x[2] = vsubq_s16(s[7], t[1]);
+ x[3] = vaddq_s16(s[7], t[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
+ int16x8_t out[8];
+ vpx_fdct8x8_pass1_notranspose_neon(in, out);
+ // transpose 8x8
+ transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+ in[4] = out[4];
+ in[5] = out[5];
+ in[6] = out[6];
+ in[7] = out[7];
+}
+
+static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) {
+ int16x8_t out[8];
+ vpx_fdct8x8_pass2_notranspose_neon(in, out);
+ // transpose 8x8
+ transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+ in[4] = out[4];
+ in[5] = out[5];
+ in[6] = out[6];
+ in[7] = out[7];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // fdct4(step, step);
+ // x0 = s0 + s3;
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ // x1 = s1 + s2;
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ // x2 = s1 - s2;
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ // x3 = s0 - s3;
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[4], &right[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64,
+ &left[2], &right[2], &left[6], &right[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+ &tr[1], &tl[0], &tr[0]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], tl[0]);
+ xr[0] = vaddq_s32(sr[4], tr[0]);
+ xl[1] = vsubq_s32(sl[4], tl[0]);
+ xr[1] = vsubq_s32(sr[4], tr[0]);
+ xl[2] = vsubq_s32(sl[7], tl[1]);
+ xr[2] = vsubq_s32(sr[7], tr[1]);
+ xl[3] = vaddq_s32(sl[7], tl[1]);
+ xr[3] = vaddq_s32(sr[7], tr[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64,
+ &left[1], &right[1], &left[7], &right[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64,
+ &left[5], &right[5], &left[3], &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // fdct4(step, step);
+ // x0 = s0 + s3;
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ // x1 = s1 + s2;
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ // x2 = s1 - s2;
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ // x3 = s0 - s3;
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[4], &right[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+ cospi_24_64, &left[2], &right[2], &left[6],
+ &right[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+ &tr[1], &tl[0], &tr[0]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], tl[0]);
+ xr[0] = vaddq_s32(sr[4], tr[0]);
+ xl[1] = vsubq_s32(sl[4], tl[0]);
+ xr[1] = vsubq_s32(sr[4], tr[0]);
+ xl[2] = vsubq_s32(sl[7], tl[1]);
+ xr[2] = vsubq_s32(sr[7], tr[1]);
+ xl[3] = vaddq_s32(sl[7], tl[1]);
+ xr[3] = vaddq_s32(sr[7], tr[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+ cospi_28_64, &left[1], &right[1], &left[7],
+ &right[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+ cospi_12_64, &left[5], &right[5], &left[3],
+ &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4x2_t out[8];
+ vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
+
+ out[0].val[0] = left[0];
+ out[0].val[1] = right[0];
+ out[1].val[0] = left[1];
+ out[1].val[1] = right[1];
+ out[2].val[0] = left[2];
+ out[2].val[1] = right[2];
+ out[3].val[0] = left[3];
+ out[3].val[1] = right[3];
+ out[4].val[0] = left[4];
+ out[4].val[1] = right[4];
+ out[5].val[0] = left[5];
+ out[5].val[1] = right[5];
+ out[6].val[0] = left[6];
+ out[6].val[1] = right[6];
+ out[7].val[0] = left[7];
+ out[7].val[1] = right[7];
+
+ transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+
+ left[0] = out[0].val[0];
+ right[0] = out[0].val[1];
+ left[1] = out[1].val[0];
+ right[1] = out[1].val[1];
+ left[2] = out[2].val[0];
+ right[2] = out[2].val[1];
+ left[3] = out[3].val[0];
+ right[3] = out[3].val[1];
+ left[4] = out[4].val[0];
+ right[4] = out[4].val[1];
+ left[5] = out[5].val[0];
+ right[5] = out[5].val[1];
+ left[6] = out[6].val[0];
+ right[6] = out[6].val[1];
+ left[7] = out[7].val[0];
+ right[7] = out[7].val[1];
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4x2_t out[8];
+ vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right);
+
+ out[0].val[0] = left[0];
+ out[0].val[1] = right[0];
+ out[1].val[0] = left[1];
+ out[1].val[1] = right[1];
+ out[2].val[0] = left[2];
+ out[2].val[1] = right[2];
+ out[3].val[0] = left[3];
+ out[3].val[1] = right[3];
+ out[4].val[0] = left[4];
+ out[4].val[1] = right[4];
+ out[5].val[0] = left[5];
+ out[5].val[1] = right[5];
+ out[6].val[0] = left[6];
+ out[6].val[1] = right[6];
+ out[7].val[0] = left[7];
+ out[7].val[1] = right[7];
+
+ transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+
+ left[0] = out[0].val[0];
+ right[0] = out[0].val[1];
+ left[1] = out[1].val[0];
+ right[1] = out[1].val[1];
+ left[2] = out[2].val[0];
+ right[2] = out[2].val[1];
+ left[3] = out[3].val[0];
+ right[3] = out[3].val[1];
+ left[4] = out[4].val[0];
+ right[4] = out[4].val[1];
+ left[5] = out[5].val[0];
+ right[5] = out[5].val[1];
+ left[6] = out[6].val[0];
+ right[6] = out[6].val[1];
+ left[7] = out[7].val[0];
+ right[7] = out[7].val[1];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h
index 28d7d86bf..193594e3d 100644
--- a/vpx_dsp/arm/fdct_neon.h
+++ b/vpx_dsp/arm/fdct_neon.h
@@ -13,201 +13,411 @@
#include <arm_neon.h>
-static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
- const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
- const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
-
- // in_0 +/- in_3, in_1 +/- in_2
- const int16x8_t s_01 = vaddq_s16(input_01, input_32);
- const int16x8_t s_32 = vsubq_s16(input_01, input_32);
-
- // step_0 +/- step_1, step_2 +/- step_3
- const int16x4_t s_0 = vget_low_s16(s_01);
- const int16x4_t s_1 = vget_high_s16(s_01);
- const int16x4_t s_2 = vget_high_s16(s_32);
- const int16x4_t s_3 = vget_low_s16(s_32);
-
- // (s_0 +/- s_1) * cospi_16_64
- // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
- const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
- const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
- const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
- const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
-
- // fdct_round_shift
- int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
- int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
-
- // s_3 * cospi_8_64 + s_2 * cospi_24_64
- // s_3 * cospi_24_64 - s_2 * cospi_8_64
- const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
- const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
-
- const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
- const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
-
- // fdct_round_shift
- int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
- int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
-
- transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
-
- in[0] = out_0;
- in[1] = out_1;
- in[2] = out_2;
- in[3] = out_3;
-}
-
-static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
- int16x8_t *out) {
- const int16x8_t v_s0 = vaddq_s16(in[0], in[7]);
- const int16x8_t v_s1 = vaddq_s16(in[1], in[6]);
- const int16x8_t v_s2 = vaddq_s16(in[2], in[5]);
- const int16x8_t v_s3 = vaddq_s16(in[3], in[4]);
- const int16x8_t v_s4 = vsubq_s16(in[3], in[4]);
- const int16x8_t v_s5 = vsubq_s16(in[2], in[5]);
- const int16x8_t v_s6 = vsubq_s16(in[1], in[6]);
- const int16x8_t v_s7 = vsubq_s16(in[0], in[7]);
- // fdct4(step, step);
- int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
- int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
- int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
- int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
- // fdct4(step, step);
- int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
- int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
- int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
- int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
- int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
- int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
- int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
- int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
- v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
- v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
- v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
- v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
- v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
- v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
- v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
- v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
- const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
- const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
- const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
- out[0] = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
- out[2] = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
- out[4] = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
- out[6] = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
- }
- // Stage 2
- v_x0 = vsubq_s16(v_s6, v_s5);
- v_x1 = vaddq_s16(v_s6, v_s5);
- v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
- v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
- v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
- v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x8_t ab = vcombine_s16(a, b);
- const int16x8_t cd = vcombine_s16(c, d);
- // Stage 3
- v_x0 = vaddq_s16(v_s4, ab);
- v_x1 = vsubq_s16(v_s4, ab);
- v_x2 = vsubq_s16(v_s7, cd);
- v_x3 = vaddq_s16(v_s7, cd);
- }
- // Stage 4
- v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
- v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
- v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
- v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
- v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
- v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
- v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
- v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
- v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
- v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
- v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
- v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
- v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
- v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
- v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
- v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
- const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
- const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
- const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
- out[1] = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
- out[3] = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
- out[5] = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
- out[7] = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
- }
-}
-
-static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
- int16x8_t out[8];
- vpx_fdct8x8_pass1_notranspose_neon(in, out);
- // transpose 8x8
- // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
- // columns.
- {
- // 00 01 02 03 40 41 42 43
- // 10 11 12 13 50 51 52 53
- // 20 21 22 23 60 61 62 63
- // 30 31 32 33 70 71 72 73
- // 04 05 06 07 44 45 46 47
- // 14 15 16 17 54 55 56 57
- // 24 25 26 27 64 65 66 67
- // 34 35 36 37 74 75 76 77
- const int32x4x2_t r02_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2]));
- const int32x4x2_t r13_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3]));
- const int32x4x2_t r46_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6]));
- const int32x4x2_t r57_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7]));
- const int16x8x2_t r01_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
- vreinterpretq_s16_s32(r13_s32.val[0]));
- const int16x8x2_t r23_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
- vreinterpretq_s16_s32(r13_s32.val[1]));
- const int16x8x2_t r45_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
- vreinterpretq_s16_s32(r57_s32.val[0]));
- const int16x8x2_t r67_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
- vreinterpretq_s16_s32(r57_s32.val[1]));
- in[0] = r01_s16.val[0];
- in[1] = r01_s16.val[1];
- in[2] = r23_s16.val[0];
- in[3] = r23_s16.val[1];
- in[4] = r45_s16.val[0];
- in[5] = r45_s16.val[1];
- in[6] = r67_s16.val[0];
- in[7] = r67_s16.val[1];
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- }
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulh_s16 operation on half vector
+// can be slightly less accurate, adequate for pass1
+static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a,
+ const int16x4_t b,
+ const tran_coef_t constant,
+ int16x4_t *add,
+ int16x4_t *sub) {
+ int16x4_t c = vdup_n_s16(2 * constant);
+ *add = vqrdmulh_s16(vadd_s16(a, b), c);
+ *sub = vqrdmulh_s16(vsub_s16(a, b), c);
}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulh_s16 operation on full vector
+// can be slightly less accurate, adequate for pass1
+static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a,
+ const int16x8_t b,
+ const tran_coef_t constant,
+ int16x8_t *add,
+ int16x8_t *sub) {
+ int16x8_t c = vdupq_n_s16(2 * constant);
+ *add = vqrdmulhq_s16(vaddq_s16(a, b), c);
+ *sub = vqrdmulhq_s16(vsubq_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values, high/low
+static INLINE void butterfly_one_coeff_s16_s32_fast(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+ int32x4_t *sub_hi) {
+ int32x4_t c = vdupq_n_s32(constant << 17);
+ const int16x4_t a_lo = vget_low_s16(a);
+ const int16x4_t a_hi = vget_high_s16(a);
+ const int16x4_t b_lo = vget_low_s16(b);
+ const int16x4_t b_hi = vget_high_s16(b);
+ *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c);
+ *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c);
+ *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c);
+ *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values, high/low
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int16x8_t *add, int16x8_t *sub) {
+ int32x4_t add_lo, add_hi, sub_lo, sub_hi;
+ butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo,
+ &sub_hi);
+ *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi));
+ *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi));
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values, high/low
+static INLINE void butterfly_one_coeff_s16_s32_fast_half(
+ const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+ int32x4_t *add, int32x4_t *sub) {
+ int32x4_t c = vdupq_n_s32(constant << 17);
+ *add = vqrdmulhq_s32(vaddl_s16(a, b), c);
+ *sub = vqrdmulhq_s32(vsubl_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half(
+ const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+ int16x4_t *add, int16x4_t *sub) {
+ int32x4_t add32, sub32;
+ butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32);
+ *add = vmovn_s32(add32);
+ *sub = vmovn_s32(sub32);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original Variant that performs normal implementation on full vector
+// fully accurate does 32-bit processing, takes 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+ int32x4_t *sub_hi) {
+ const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
+ const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
+ const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
+ const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original Variant that performs normal implementation on full vector
+// fully accurate does 32-bit processing, takes 16-bit values
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_narrow(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int16x8_t *add, int16x8_t *sub) {
+ int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi;
+ butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo,
+ &sub32_hi);
+ *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi));
+ *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi));
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_noround(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+ int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmulq_n_s32(a_lo, constant);
+ const int32x4_t a2 = vmulq_n_s32(a_hi, constant);
+ const int32x4_t a3 = vmulq_n_s32(a_lo, constant);
+ const int32x4_t a4 = vmulq_n_s32(a_hi, constant);
+ *add_lo = vmlaq_n_s32(a1, b_lo, constant);
+ *add_hi = vmlaq_n_s32(a2, b_hi, constant);
+ *sub_lo = vmlsq_n_s32(a3, b_lo, constant);
+ *sub_hi = vmlsq_n_s32(a4, b_hi, constant);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_fast_half(const int32x4_t a,
+ const int32x4_t b,
+ const tran_coef_t constant,
+ int32x4_t *add,
+ int32x4_t *sub) {
+ const int32x4_t c = vdupq_n_s32(constant << 17);
+ *add = vqrdmulhq_s32(vaddq_s32(a, b), c);
+ *sub = vqrdmulhq_s32(vsubq_s32(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_fast(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+ int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t c = vdupq_n_s32(constant << 17);
+ *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c);
+ *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c);
+ *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c);
+ *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on half vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_s64_narrow_half(
+ const int32x4_t a, const int32x4_t b, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) {
+ const int32x2_t a_lo = vget_low_s32(a);
+ const int32x2_t a_hi = vget_high_s32(a);
+ const int32x2_t b_lo = vget_low_s32(b);
+ const int32x2_t b_hi = vget_high_s32(b);
+
+ const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1);
+ const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1);
+ const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2);
+ const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2);
+
+ const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2);
+ const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2);
+ const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, constant1);
+ const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1);
+
+ *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
+ vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
+ *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
+ vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on full vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_s64_narrow(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ // ac1/ac2 hold the following values:
+ // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+ // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+ // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+ // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+ int64x2_t ac1[4];
+ int64x2_t ac2[4];
+ int64x2_t sum[4];
+ int64x2_t diff[4];
+
+ ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+ ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+ ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+ ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+ ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+ ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+ ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+ ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+ sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+ sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+ sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+ sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+ *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+ *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+ diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+ diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+ diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+ diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+ *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+ *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s16_s32_noround(
+ const int16x4_t a_lo, const int16x4_t a_hi, const int16x4_t b_lo,
+ const int16x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmull_n_s16(a_lo, constant1);
+ const int32x4_t a2 = vmull_n_s16(a_hi, constant1);
+ const int32x4_t a3 = vmull_n_s16(a_lo, constant2);
+ const int32x4_t a4 = vmull_n_s16(a_hi, constant2);
+ *add_lo = vmlal_n_s16(a1, b_lo, constant2);
+ *add_hi = vmlal_n_s16(a2, b_hi, constant2);
+ *sub_lo = vmlsl_n_s16(a3, b_lo, constant1);
+ *sub_hi = vmlsl_n_s16(a4, b_hi, constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_noround(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+ const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+ const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+ const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+ *add_lo = vmlaq_n_s32(a1, b_lo, constant2);
+ *add_hi = vmlaq_n_s32(a2, b_hi, constant2);
+ *sub_lo = vmlsq_n_s32(a3, b_lo, constant1);
+ *sub_hi = vmlsq_n_s32(a4, b_hi, constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on half vector
+// more accurate does 32-bit processing, takes and returns 16-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_half(const int16x4_t a,
+ const int16x4_t b,
+ const tran_coef_t constant1,
+ const tran_coef_t constant2,
+ int16x4_t *add, int16x4_t *sub) {
+ const int32x4_t a1 = vmull_n_s16(a, constant1);
+ const int32x4_t a2 = vmull_n_s16(a, constant2);
+ const int32x4_t sum = vmlal_n_s16(a1, b, constant2);
+ const int32x4_t diff = vmlsl_n_s16(a2, b, constant1);
+ *add = vqrshrn_n_s32(sum, DCT_CONST_BITS);
+ *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 16-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+ const tran_coef_t constant1,
+ const tran_coef_t constant2,
+ int16x8_t *add, int16x8_t *sub) {
+ const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1);
+ const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1);
+ const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2);
+ const int32x4_t a4 = vmull_n_s16(vget_high_s16(a), constant2);
+ const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2);
+ const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2);
+ const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1);
+ const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), constant1);
+ const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+ const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+ const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+ const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+ *add = vcombine_s16(rounded0, rounded1);
+ *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+ const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+ const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+ const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+ const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2);
+ const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2);
+ const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1);
+ const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
+ const int16x8_t one = vdupq_n_s16(1);
+ const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+ const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+ const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+ return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift and round,
+// return narrowed results
+static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo,
+ const int32x4_t a_hi) {
+ const int32x4_t one = vdupq_n_s32(1);
+ const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
+ const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
+ const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
+ const int16x4_t b_lo =
+ vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
+ const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
+ const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
+ const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
+ const int16x4_t b_hi =
+ vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
+ return vcombine_s16(b_lo, b_hi);
+}
+
+// Add 1 if negative, and shift by 1.
+// In practice, add the sign bit, then shift and round
+static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) {
+ const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+ const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+ const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+ return vshrq_n_s32(vaddq_s32(a, a_sign_s32), 1);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) {
+ const int32x4_t one = vdupq_n_s32(1);
+ const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+ const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+ const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+ return vshrq_n_s32(vaddq_s32(vaddq_s32(a, a_sign_s32), one), 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) {
+ const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+ const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+ const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+ return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) {
+ const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+ const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+ const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+ return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2);
+}
+
#endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_
diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c
index 0a1cdca41..718dba0d9 100644
--- a/vpx_dsp/arm/fdct_partial_neon.c
+++ b/vpx_dsp/arm/fdct_partial_neon.c
@@ -101,3 +101,68 @@ void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
output[0] = (tran_low_t)(sum >> 3);
output[1] = 0;
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+ int32_t sum;
+
+ int r = 0;
+ do {
+ const int16x8_t a = vld1q_s16(input);
+ const int16x8_t b = vld1q_s16(input + 8);
+ input += stride;
+ partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a));
+ partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a));
+ partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(b));
+ partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(b));
+ r++;
+ } while (r < 16);
+
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]);
+ partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]);
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]);
+ sum = horizontal_add_int32x4(partial_sum[0]);
+
+ output[0] = (tran_low_t)(sum >> 1);
+ output[1] = 0;
+}
+
+void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+
+ int32_t sum;
+
+ int r = 0;
+ do {
+ const int16x8_t a0 = vld1q_s16(input);
+ const int16x8_t a1 = vld1q_s16(input + 8);
+ const int16x8_t a2 = vld1q_s16(input + 16);
+ const int16x8_t a3 = vld1q_s16(input + 24);
+ input += stride;
+ partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a0));
+ partial_sum[0] = vaddw_s16(partial_sum[0], vget_high_s16(a0));
+ partial_sum[1] = vaddw_s16(partial_sum[1], vget_low_s16(a1));
+ partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a1));
+ partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(a2));
+ partial_sum[2] = vaddw_s16(partial_sum[2], vget_high_s16(a2));
+ partial_sum[3] = vaddw_s16(partial_sum[3], vget_low_s16(a3));
+ partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(a3));
+ r++;
+ } while (r < 32);
+
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]);
+ partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]);
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]);
+ sum = horizontal_add_int32x4(partial_sum[0]);
+
+ output[0] = (tran_low_t)(sum >> 3);
+ output[1] = 0;
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c
deleted file mode 100644
index d9161c6d3..000000000
--- a/vpx_dsp/arm/fwd_txfm_neon.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/txfm_common.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/arm/idct_neon.h"
-#include "vpx_dsp/arm/fdct_neon.h"
-#include "vpx_dsp/arm/mem_neon.h"
-
-void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
- int stride) {
- int i;
- // stage 1
- int16x8_t in[8];
- in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
- in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
- in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
- in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
- in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
- in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
- in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
- in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
- for (i = 0; i < 2; ++i) {
- vpx_fdct8x8_pass1_neon(in);
- } // for
- {
- // from vpx_dct_sse2.c
- // Post-condition (division by two)
- // division of two 16 bits signed numbers using shifts
- // n / 2 = (n - (n >> 15)) >> 1
- const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15);
- const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15);
- const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15);
- const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15);
- const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15);
- const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15);
- const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15);
- const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15);
- in[0] = vhsubq_s16(in[0], sign_in0);
- in[1] = vhsubq_s16(in[1], sign_in1);
- in[2] = vhsubq_s16(in[2], sign_in2);
- in[3] = vhsubq_s16(in[3], sign_in3);
- in[4] = vhsubq_s16(in[4], sign_in4);
- in[5] = vhsubq_s16(in[5], sign_in5);
- in[6] = vhsubq_s16(in[6], sign_in6);
- in[7] = vhsubq_s16(in[7], sign_in7);
- // store results
- store_s16q_to_tran_low(final_output + 0 * 8, in[0]);
- store_s16q_to_tran_low(final_output + 1 * 8, in[1]);
- store_s16q_to_tran_low(final_output + 2 * 8, in[2]);
- store_s16q_to_tran_low(final_output + 3 * 8, in[3]);
- store_s16q_to_tran_low(final_output + 4 * 8, in[4]);
- store_s16q_to_tran_low(final_output + 5 * 8, in[5]);
- store_s16q_to_tran_low(final_output + 6 * 8, in[6]);
- store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
- }
-}
diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c
index 523a63c6f..f6b6d7e3c 100644
--- a/vpx_dsp/arm/hadamard_neon.c
+++ b/vpx_dsp/arm/hadamard_neon.c
@@ -114,3 +114,45 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
coeff += 8;
}
}
+
+void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int i;
+
+ /* Rearrange 32x32 to 16x64 and remove stride.
+ * Top left first. */
+ vpx_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+ /* Top right. */
+ vpx_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride,
+ coeff + 256);
+ /* Bottom left. */
+ vpx_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride,
+ coeff + 512);
+ /* Bottom right. */
+ vpx_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride,
+ coeff + 768);
+
+ for (i = 0; i < 256; i += 8) {
+ const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0);
+ const int16x8_t a1 = load_tran_low_to_s16q(coeff + 256);
+ const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512);
+ const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768);
+
+ const int16x8_t b0 = vhaddq_s16(a0, a1);
+ const int16x8_t b1 = vhsubq_s16(a0, a1);
+ const int16x8_t b2 = vhaddq_s16(a2, a3);
+ const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+ const int16x8_t c0 = vhaddq_s16(b0, b2);
+ const int16x8_t c1 = vhaddq_s16(b1, b3);
+ const int16x8_t c2 = vhsubq_s16(b0, b2);
+ const int16x8_t c3 = vhsubq_s16(b1, b3);
+
+ store_s16q_to_tran_low(coeff + 0, c0);
+ store_s16q_to_tran_low(coeff + 256, c1);
+ store_s16q_to_tran_low(coeff + 512, c2);
+ store_s16q_to_tran_low(coeff + 768, c3);
+
+ coeff += 8;
+ }
+}
diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c
new file mode 100644
index 000000000..b9f72a94c
--- /dev/null
+++ b/vpx_dsp/arm/highbd_quantize_neon.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store(
+ const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1,
+ tran_low_t *dqcoeff_ptr) {
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_8_neon(
+ const int32x4_t coeff_0, const int32x4_t coeff_1, const int32x4_t zbin,
+ const int32x4_t round, const int32x4_t quant, const int32x4_t quant_shift,
+ int32x4_t *qcoeff_0, int32x4_t *qcoeff_1) {
+ // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
+ const int32x4_t coeff_0_sign = vshrq_n_s32(coeff_0, 31);
+ const int32x4_t coeff_1_sign = vshrq_n_s32(coeff_1, 31);
+ const int32x4_t coeff_0_abs = vabsq_s32(coeff_0);
+ const int32x4_t coeff_1_abs = vabsq_s32(coeff_1);
+
+ // Calculate 2 masks of elements outside the bin
+ const int32x4_t zbin_mask_0 =
+ vreinterpretq_s32_u32(vcgeq_s32(coeff_0_abs, zbin));
+ const int32x4_t zbin_mask_1 = vreinterpretq_s32_u32(
+ vcgeq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(zbin), 1)));
+
+ // Get the rounded values
+ const int32x4_t rounded_0 = vaddq_s32(coeff_0_abs, round);
+ const int32x4_t rounded_1 =
+ vaddq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(round), 1));
+
+ // (round * (quant << 15) * 2) >> 16 == (round * quant)
+ int32x4_t qcoeff_tmp_0 = vqdmulhq_s32(rounded_0, quant);
+ int32x4_t qcoeff_tmp_1 =
+ vqdmulhq_s32(rounded_1, vdupq_lane_s32(vget_low_s32(quant), 1));
+
+ // Add rounded values
+ qcoeff_tmp_0 = vaddq_s32(qcoeff_tmp_0, rounded_0);
+ qcoeff_tmp_1 = vaddq_s32(qcoeff_tmp_1, rounded_1);
+
+ // (round * (quant_shift << 15) * 2) >> 16 == (round * quant_shift)
+ qcoeff_tmp_0 = vqdmulhq_s32(qcoeff_tmp_0, quant_shift);
+ qcoeff_tmp_1 =
+ vqdmulhq_s32(qcoeff_tmp_1, vdupq_lane_s32(vget_low_s32(quant_shift), 1));
+
+ // Restore the sign bit.
+ qcoeff_tmp_0 = veorq_s32(qcoeff_tmp_0, coeff_0_sign);
+ qcoeff_tmp_1 = veorq_s32(qcoeff_tmp_1, coeff_1_sign);
+ qcoeff_tmp_0 = vsubq_s32(qcoeff_tmp_0, coeff_0_sign);
+ qcoeff_tmp_1 = vsubq_s32(qcoeff_tmp_1, coeff_1_sign);
+
+ // Only keep the relevant coeffs
+ *qcoeff_0 = vandq_s32(qcoeff_tmp_0, zbin_mask_0);
+ *qcoeff_1 = vandq_s32(qcoeff_tmp_1, zbin_mask_1);
+}
+
+static VPX_FORCE_INLINE int16x8_t
+highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int32x4_t zbin,
+ const int32x4_t round, const int32x4_t quant,
+ const int32x4_t quant_shift, const int32x4_t dequant) {
+ int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
+
+ // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
+ const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
+ const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
+ highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
+ &qcoeff_0, &qcoeff_1);
+
+ // Store the 32-bit qcoeffs
+ vst1q_s32(qcoeff_ptr, qcoeff_0);
+ vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
+
+ // Calculate and store the dqcoeffs
+ dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
+ dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
+
+ highbd_calculate_dqcoeff_and_store(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
+
+ return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
+}
+
+void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int16x8_t neg_one = vdupq_n_s16(-1);
+ uint16x8_t eob_max;
+
+ // Only the first element of each vector is DC.
+ // High half has identical elements, but we can reconstruct it from the low
+ // half by duplicating the 2nd element. So we only need to pass a 4x32-bit
+ // vector
+ int32x4_t zbin = vmovl_s16(vld1_s16(zbin_ptr));
+ int32x4_t round = vmovl_s16(vld1_s16(round_ptr));
+ // Extend the quant, quant_shift vectors to ones of 32-bit elements
+ // scale to high-half, so we can use vqdmulhq_s32
+ int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15);
+ int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 15);
+ int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));
+
+ // Process first 8 values which include a dc component.
+ {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+ __builtin_prefetch(coeff_ptr + 64);
+
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+
+ n_coeffs -= 8;
+
+ {
+ zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
+ round = vdupq_lane_s32(vget_low_s32(round), 1);
+ quant = vdupq_lane_s32(vget_low_s32(quant), 1);
+ quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
+ dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);
+
+ do {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+ round, quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max =
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ n_coeffs -= 8;
+ } while (n_coeffs > 0);
+ }
+
+#ifdef __aarch64__
+ *eob_ptr = vmaxvq_u16(eob_max);
+#else
+ {
+ const uint16x4_t eob_max_0 =
+ vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+ const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+ const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+ vst1_lane_u16(eob_ptr, eob_max_2, 0);
+ }
+#endif // __aarch64__
+ // Need these here, else the compiler complains about mixing declarations and
+ // code in C90
+ (void)n_coeffs;
+ (void)scan;
+}
+
+static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) {
+ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+}
+
+static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store_32x32(
+ int32x4_t dqcoeff_0, int32x4_t dqcoeff_1, tran_low_t *dqcoeff_ptr) {
+ // Add 1 if negative to round towards zero because the C uses division.
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+ dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+ dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
+ dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+}
+
+static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int32x4_t zbin, const int32x4_t round,
+ const int32x4_t quant, const int32x4_t quant_shift,
+ const int32x4_t dequant) {
+ int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
+
+ // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
+ const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
+ const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
+ highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
+ &qcoeff_0, &qcoeff_1);
+
+ // Store the 32-bit qcoeffs
+ vst1q_s32(qcoeff_ptr, qcoeff_0);
+ vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
+
+ // Calculate and store the dqcoeffs
+ dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
+ dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
+
+ highbd_calculate_dqcoeff_and_store_32x32(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
+
+ return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
+}
+
+void vpx_highbd_quantize_b_32x32_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int16x8_t neg_one = vdupq_n_s16(-1);
+ uint16x8_t eob_max;
+ int i;
+
+ // Only the first element of each vector is DC.
+ // High half has identical elements, but we can reconstruct it from the low
+ // half by duplicating the 2nd element. So we only need to pass a 4x32-bit
+ // vector
+ int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1);
+ int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1);
+ // Extend the quant, quant_shift vectors to ones of 32-bit elements
+ // scale to high-half, so we can use vqdmulhq_s32
+ int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15);
+ int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16);
+ int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));
+
+ // Process first 8 values which include a dc component.
+ {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+ round, quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+
+ {
+ zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
+ round = vdupq_lane_s32(vget_low_s32(round), 1);
+ quant = vdupq_lane_s32(vget_low_s32(quant), 1);
+ quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
+ dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);
+
+ for (i = 1; i < 32 * 32 / 8; ++i) {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+ round, quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max =
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+ }
+
+#ifdef __aarch64__
+ *eob_ptr = vmaxvq_u16(eob_max);
+#else
+ {
+ const uint16x4_t eob_max_0 =
+ vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+ const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+ const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+ vst1_lane_u16(eob_ptr, eob_max_2, 0);
+ }
+#endif // __aarch64__
+ // Need these here, else the compiler complains about mixing declarations and
+ // code in C90
+ (void)n_coeffs;
+ (void)scan;
+}
diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c
new file mode 100644
index 000000000..ecb52ce5a
--- /dev/null
+++ b/vpx_dsp/arm/highbd_sad_neon.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static VPX_FORCE_INLINE uint32_t highbd_sad4_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int width,
+ int height) {
+ int i, j;
+ uint32x4_t sum_abs_diff = vdupq_n_u32(0);
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 4) {
+ const uint16x4_t src_u16 = vld1_u16(src16_ptr + j);
+ const uint16x4_t ref_u16 = vld1_u16(ref16_ptr + j);
+ sum_abs_diff = vabal_u16(sum_abs_diff, src_u16, ref_u16);
+ }
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ }
+
+ return horizontal_add_uint32x4(sum_abs_diff);
+}
+
+static VPX_FORCE_INLINE uint32_t highbd_sad8_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int width,
+ int height) {
+ int i, j;
+ uint32x4_t sum_abs_diff = vdupq_n_u32(0);
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 8) {
+ const uint16x8_t src_u16 = vld1q_u16(src16_ptr + j);
+ const uint16x8_t ref_u16 = vld1q_u16(ref16_ptr + j);
+ sum_abs_diff =
+ vabal_u16(sum_abs_diff, vget_low_u16(src_u16), vget_low_u16(ref_u16));
+ sum_abs_diff = vabal_u16(sum_abs_diff, vget_high_u16(src_u16),
+ vget_high_u16(ref_u16));
+ }
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ }
+
+ return horizontal_add_uint32x4(sum_abs_diff);
+}
+
+static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, const uint8_t *second_pred, int width, int height) {
+ int i, j;
+ uint32x4_t sum_abs_diff = vdupq_n_u32(0);
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 4) {
+ const uint16x4_t a_u16 = vld1_u16(src16_ptr + j);
+ const uint16x4_t b_u16 = vld1_u16(ref16_ptr + j);
+ const uint16x4_t c_u16 = vld1_u16(pred_ptr + j);
+ const uint16x4_t avg = vrhadd_u16(b_u16, c_u16);
+ sum_abs_diff = vabal_u16(sum_abs_diff, a_u16, avg);
+ }
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred_ptr += width;
+ }
+
+ return horizontal_add_uint32x4(sum_abs_diff);
+}
+
+static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, const uint8_t *second_pred, int width, int height) {
+ int i, j;
+ uint32x4_t sum_abs_diff = vdupq_n_u32(0);
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 8) {
+ const uint16x8_t a_u16 = vld1q_u16(src16_ptr + j);
+ const uint16x8_t b_u16 = vld1q_u16(ref16_ptr + j);
+ const uint16x8_t c_u16 = vld1q_u16(pred_ptr + j);
+ const uint16x8_t avg = vrhaddq_u16(b_u16, c_u16);
+ sum_abs_diff =
+ vabal_u16(sum_abs_diff, vget_low_u16(a_u16), vget_low_u16(avg));
+ sum_abs_diff =
+ vabal_u16(sum_abs_diff, vget_high_u16(a_u16), vget_high_u16(avg));
+ }
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred_ptr += width;
+ }
+
+ return horizontal_add_uint32x4(sum_abs_diff);
+}
+
+#define highbd_sad4MxN(m, n) \
+ unsigned int vpx_highbd_sad##m##x##n##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
+ }
+
+#define highbd_sadMxN(m, n) \
+ unsigned int vpx_highbd_sad##m##x##n##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
+ }
+
+#define highbd_sad4MxN_avg(m, n) \
+ unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return highbd_sad4_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \
+ second_pred, m, n); \
+ }
+
+#define highbd_sadMxN_avg(m, n) \
+ unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return highbd_sad8_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \
+ second_pred, m, n); \
+ }
+
+#define highbd_sadMxNx4D(m, n) \
+ void vpx_highbd_sad##m##x##n##x4d_neon( \
+ const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], int ref_stride, \
+ uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = vpx_highbd_sad##m##x##n##_neon(src_ptr, src_stride, \
+ ref_array[i], ref_stride); \
+ } \
+ }
+
+/* clang-format off */
+// 4x4
+highbd_sad4MxN(4, 4)
+highbd_sad4MxN_avg(4, 4)
+highbd_sadMxNx4D(4, 4)
+
+// 4x8
+highbd_sad4MxN(4, 8)
+highbd_sad4MxN_avg(4, 8)
+highbd_sadMxNx4D(4, 8)
+
+// 8x4
+highbd_sadMxN(8, 4)
+highbd_sadMxN_avg(8, 4)
+highbd_sadMxNx4D(8, 4)
+
+// 8x8
+highbd_sadMxN(8, 8)
+highbd_sadMxN_avg(8, 8)
+highbd_sadMxNx4D(8, 8)
+
+// 8x16
+highbd_sadMxN(8, 16)
+highbd_sadMxN_avg(8, 16)
+highbd_sadMxNx4D(8, 16)
+
+// 16x8
+highbd_sadMxN(16, 8)
+highbd_sadMxN_avg(16, 8)
+highbd_sadMxNx4D(16, 8)
+
+// 16x16
+highbd_sadMxN(16, 16)
+highbd_sadMxN_avg(16, 16)
+highbd_sadMxNx4D(16, 16)
+
+// 16x32
+highbd_sadMxN(16, 32)
+highbd_sadMxN_avg(16, 32)
+highbd_sadMxNx4D(16, 32)
+
+// 32x16
+highbd_sadMxN(32, 16)
+highbd_sadMxN_avg(32, 16)
+highbd_sadMxNx4D(32, 16)
+
+// 32x32
+highbd_sadMxN(32, 32)
+highbd_sadMxN_avg(32, 32)
+highbd_sadMxNx4D(32, 32)
+
+// 32x64
+highbd_sadMxN(32, 64)
+highbd_sadMxN_avg(32, 64)
+highbd_sadMxNx4D(32, 64)
+
+// 64x32
+highbd_sadMxN(64, 32)
+highbd_sadMxN_avg(64, 32)
+highbd_sadMxNx4D(64, 32)
+
+// 64x64
+highbd_sadMxN(64, 64)
+highbd_sadMxN_avg(64, 64)
+highbd_sadMxNx4D(64, 64)
+ /* clang-format on */
diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c
new file mode 100644
index 000000000..96a35af01
--- /dev/null
+++ b/vpx_dsp/arm/highbd_variance_neon.c
@@ -0,0 +1,496 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
+static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ int w, int h, uint64_t *sse,
+ int64_t *sum) {
+ int i, j;
+
+ if (w >= 8) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const int16x8_t src_s16 = vreinterpretq_s16_u16(vld1q_u16(&src_ptr[j]));
+ const int16x8_t ref_s16 = vreinterpretq_s16_u16(vld1q_u16(&ref_ptr[j]));
+ const int32x4_t diff1_s32 =
+ vsubl_s16(vget_low_s16(src_s16), vget_low_s16(ref_s16));
+ const int32x4_t diff2_s32 =
+ vsubl_s16(vget_high_s16(src_s16), vget_high_s16(ref_s16));
+ const uint32x4_t diff1_u32 = vreinterpretq_u32_s32(diff1_s32);
+ const uint32x4_t diff2_u32 = vreinterpretq_u32_s32(diff2_s32);
+ sum_s32 = vaddq_s32(sum_s32, diff1_s32);
+ sum_s32 = vaddq_s32(sum_s32, diff2_s32);
+ sse_u32 = vmlaq_u32(sse_u32, diff1_u32, diff1_u32);
+ sse_u32 = vmlaq_u32(sse_u32, diff2_u32, diff2_u32);
+ }
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = horizontal_add_uint32x4(sse_u32);
+ } else {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ assert(w >= 4);
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 4) {
+ const int16x4_t src_s16 = vreinterpret_s16_u16(vld1_u16(&src_ptr[j]));
+ const int16x4_t ref_s16 = vreinterpret_s16_u16(vld1_u16(&ref_ptr[j]));
+ const int32x4_t diff_s32 = vsubl_s16(src_s16, ref_s16);
+ const uint32x4_t diff_u32 = vreinterpretq_u32_s32(diff_s32);
+ sum_s32 = vaddq_s32(sum_s32, diff_s32);
+ sse_u32 = vmlaq_u32(sse_u32, diff_u32, diff_u32);
+ }
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = horizontal_add_uint32x4(sse_u32);
+ }
+}
+
+static INLINE void highbd_variance64(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride,
+ int w, int h, uint64_t *sse,
+ int64_t *sum) {
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr);
+
+ if (w < 32 && h < 32) {
+ highbd_variance16(src_ptr, src_stride, ref_ptr, ref_stride, w, h, sse, sum);
+ } else {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ int k, l;
+ for (k = 0; k + 16 <= h; k += 16) {
+ for (l = 0; l + 16 <= w; l += 16) {
+ uint64_t sse_tmp = 0;
+ int64_t sum_tmp = 0;
+ highbd_variance16(src_ptr + l, src_stride, ref_ptr + l, ref_stride, 16,
+ 16, &sse_tmp, &sum_tmp);
+ sum_long += sum_tmp;
+ sse_long += sse_tmp;
+ }
+ src_ptr += 16 * src_stride;
+ ref_ptr += 16 * ref_stride;
+ }
+ *sum = sum_long;
+ *sse = sse_long;
+ }
+}
+
+static INLINE void highbd_8_variance(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long,
+ &sum_long);
+ *sse = (uint32_t)sse_long;
+ *sum = (int)sum_long;
+}
+
+static INLINE void highbd_10_variance(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long,
+ &sum_long);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+}
+
+static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long,
+ &sum_long);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+}
+
+#define HIGHBD_VAR(W, H) \
+ uint32_t vpx_highbd_8_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t vpx_highbd_12_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HIGHBD_GET_VAR(S) \
+ void vpx_highbd_8_get##S##x##S##var_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \
+ sum); \
+ } \
+ \
+ void vpx_highbd_10_get##S##x##S##var_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \
+ sum); \
+ } \
+ \
+ void vpx_highbd_12_get##S##x##S##var_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \
+ sum); \
+ }
+
+#define HIGHBD_MSE(W, H) \
+ uint32_t vpx_highbd_8_mse##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t vpx_highbd_10_mse##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t vpx_highbd_12_mse##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse; \
+ }
+
+static INLINE void highbd_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ uint32_t i, j;
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+
+ uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1);
+ uint16x4_t filter1_u16 = vdup_n_u16(filter[0]);
+ uint16x4_t filter2_u16 = vdup_n_u16(filter[1]);
+
+ if (output_width >= 8) {
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 8) {
+ const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]);
+ const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]);
+ uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16));
+ uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16));
+ uint16x4_t out1_u16;
+ uint16x4_t out2_u16;
+ sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16));
+ sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16));
+ out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS);
+ out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS);
+ vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16));
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+ } else {
+ assert(output_width >= 4);
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 4) {
+ const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]);
+ const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]);
+ uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16);
+ uint16x4_t out_u16;
+ sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16);
+ out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS);
+ vst1_u16(&output_ptr[j], out_u16);
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+ }
+}
+
+static INLINE void highbd_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ uint32_t i, j;
+
+ uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1);
+ uint16x4_t filter1_u16 = vdup_n_u16(filter[0]);
+ uint16x4_t filter2_u16 = vdup_n_u16(filter[1]);
+
+ if (output_width >= 8) {
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 8) {
+ const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]);
+ const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]);
+ uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16));
+ uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16));
+ uint16x4_t out1_u16;
+ uint16x4_t out2_u16;
+ sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16));
+ sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16));
+ out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS);
+ out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS);
+ vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16));
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+ } else {
+ assert(output_width >= 4);
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 4) {
+ const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]);
+ const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]);
+ uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16);
+ uint16x4_t out_u16;
+ sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16);
+ out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS);
+ vst1_u16(&output_ptr[j], out_u16);
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+ }
+}
+
+#define HIGHBD_SUBPIX_VAR(W, H) \
+ uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp2), W, \
+ ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ return vpx_highbd_10_variance##W##x##H##_neon( \
+ CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ return vpx_highbd_12_variance##W##x##H##_neon( \
+ CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \
+ }
+
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \
+ H, temp2, W); \
+ \
+ return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp3), W, \
+ ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \
+ H, temp2, W); \
+ \
+ return vpx_highbd_10_variance##W##x##H##_neon( \
+ CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \
+ H, temp2, W); \
+ \
+ return vpx_highbd_12_variance##W##x##H##_neon( \
+ CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \
+ }
+
+void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred,
+ int width, int height, const uint16_t *ref,
+ int ref_stride) {
+ int i, j;
+ uint32x4_t one_u32 = vdupq_n_u32(1);
+ if (width >= 8) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; j += 8) {
+ const uint16x8_t pred_u16 = vld1q_u16(&pred[j]);
+ const uint16x8_t ref_u16 = vld1q_u16(&ref[j]);
+ const uint32x4_t sum1_u32 =
+ vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16));
+ const uint32x4_t sum2_u32 =
+ vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16));
+ const uint16x4_t sum1_u16 =
+ vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1);
+ const uint16x4_t sum2_u16 =
+ vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1);
+ const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16);
+ vst1q_u16(&comp_pred[j], vcomp_pred);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else {
+ assert(width >= 4);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; j += 4) {
+ const uint16x4_t pred_u16 = vld1_u16(&pred[j]);
+ const uint16x4_t ref_u16 = vld1_u16(&ref[j]);
+ const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16);
+ const uint16x4_t vcomp_pred =
+ vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1);
+ vst1_u16(&comp_pred[j], vcomp_pred);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ }
+}
+
+/* All three forms of the variance are available in the same sizes. */
+#define HIGHBD_VARIANCES(W, H) \
+ HIGHBD_VAR(W, H) \
+ HIGHBD_SUBPIX_VAR(W, H) \
+ HIGHBD_SUBPIX_AVG_VAR(W, H)
+
+HIGHBD_VARIANCES(64, 64)
+HIGHBD_VARIANCES(64, 32)
+HIGHBD_VARIANCES(32, 64)
+HIGHBD_VARIANCES(32, 32)
+HIGHBD_VARIANCES(32, 16)
+HIGHBD_VARIANCES(16, 32)
+HIGHBD_VARIANCES(16, 16)
+HIGHBD_VARIANCES(16, 8)
+HIGHBD_VARIANCES(8, 16)
+HIGHBD_VARIANCES(8, 8)
+HIGHBD_VARIANCES(8, 4)
+HIGHBD_VARIANCES(4, 8)
+HIGHBD_VARIANCES(4, 4)
+
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
+
+HIGHBD_MSE(16, 16)
+HIGHBD_MSE(16, 8)
+HIGHBD_MSE(8, 16)
+HIGHBD_MSE(8, 8)
diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h
index 50aaa94fe..19cfc7c7f 100644
--- a/vpx_dsp/arm/mem_neon.h
+++ b/vpx_dsp/arm/mem_neon.h
@@ -116,11 +116,11 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
ptrdiff_t stride) {
uint32_t a;
- uint32x2_t a_u32 = vdup_n_u32(0);
+ uint32x2_t a_u32;
if (stride == 4) return vld1_u8(buf);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vset_lane_u32(a, a_u32, 0);
+ a_u32 = vdup_n_u32(a);
memcpy(&a, buf, 4);
a_u32 = vset_lane_u32(a, a_u32, 1);
return vreinterpret_u8_u32(a_u32);
@@ -143,11 +143,11 @@ static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
ptrdiff_t stride) {
uint32_t a;
- uint32x4_t a_u32 = vdupq_n_u32(0);
+ uint32x4_t a_u32;
if (stride == 4) return vld1q_u8(buf);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vsetq_lane_u32(a, a_u32, 0);
+ a_u32 = vdupq_n_u32(a);
memcpy(&a, buf, 4);
buf += stride;
a_u32 = vsetq_lane_u32(a, a_u32, 1);
@@ -201,4 +201,161 @@ static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) {
buf += stride;
vst1_lane_u32((uint32_t *)buf, a_u32, 1);
}
+
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+}
+
+static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p,
+ const uint8x8_t s0, const uint8x8_t s1,
+ const uint8x8_t s2, const uint8x8_t s3) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+ vst1_u8(s, s2);
+ s += p;
+ vst1_u8(s, s3);
+}
+
+static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+}
+
+static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p,
+ const uint8x16_t s0, const uint8x16_t s1,
+ const uint8x16_t s2, const uint8x16_t s3) {
+ vst1q_u8(s, s0);
+ s += p;
+ vst1q_u8(s, s1);
+ s += p;
+ vst1q_u8(s, s2);
+ s += p;
+ vst1q_u8(s, s3);
+}
+
+static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6, uint8x8_t *const s7) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+ s += p;
+ *s7 = vld1_u8(s);
+}
+
+static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
+ const uint8x8_t s0, const uint8x8_t s1,
+ const uint8x8_t s2, const uint8x8_t s3,
+ const uint8x8_t s4, const uint8x8_t s5,
+ const uint8x8_t s6, const uint8x8_t s7) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+ vst1_u8(s, s2);
+ s += p;
+ vst1_u8(s, s3);
+ s += p;
+ vst1_u8(s, s4);
+ s += p;
+ vst1_u8(s, s5);
+ s += p;
+ vst1_u8(s, s6);
+ s += p;
+ vst1_u8(s, s7);
+}
+
+static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3,
+ uint8x16_t *const s4, uint8x16_t *const s5,
+ uint8x16_t *const s6, uint8x16_t *const s7) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+ s += p;
+ *s4 = vld1q_u8(s);
+ s += p;
+ *s5 = vld1q_u8(s);
+ s += p;
+ *s6 = vld1q_u8(s);
+ s += p;
+ *s7 = vld1q_u8(s);
+}
+
+static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
+ const uint8x16_t s0, const uint8x16_t s1,
+ const uint8x16_t s2, const uint8x16_t s3,
+ const uint8x16_t s4, const uint8x16_t s5,
+ const uint8x16_t s6, const uint8x16_t s7) {
+ vst1q_u8(s, s0);
+ s += p;
+ vst1q_u8(s, s1);
+ s += p;
+ vst1q_u8(s, s2);
+ s += p;
+ vst1q_u8(s, s3);
+ s += p;
+ vst1q_u8(s, s4);
+ s += p;
+ vst1q_u8(s, s5);
+ s += p;
+ vst1q_u8(s, s6);
+ s += p;
+ vst1q_u8(s, s7);
+}
+
#endif // VPX_VPX_DSP_ARM_MEM_NEON_H_
diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c
index bd7818a07..9c227d560 100644
--- a/vpx_dsp/arm/quantize_neon.c
+++ b/vpx_dsp/arm/quantize_neon.c
@@ -17,20 +17,57 @@
static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
const int16x8_t dequant,
- tran_low_t *dqcoeff) {
+ tran_low_t *dqcoeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
const int32x4_t dqcoeff_0 =
vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
const int32x4_t dqcoeff_1 =
vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
-#if CONFIG_VP9_HIGHBITDEPTH
- vst1q_s32(dqcoeff, dqcoeff_0);
- vst1q_s32(dqcoeff + 4, dqcoeff_1);
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
#else
- vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1)));
+ vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant));
#endif // CONFIG_VP9_HIGHBITDEPTH
}
+static INLINE int16x8_t
+quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+ const int16x8_t round, const int16x8_t quant,
+ const int16x8_t quant_shift, const int16x8_t dequant) {
+ // Load coeffs as 8 x 16-bit ints, take sign and abs values
+ const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+ const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+ // Calculate mask of elements outside the bin
+ const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+ // Get the rounded values
+ const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+ // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+ int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+ qcoeff = vaddq_s16(qcoeff, rounded);
+
+ // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
+ qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
+
+ // Restore the sign bit.
+ qcoeff = veorq_s16(qcoeff, coeff_sign);
+ qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+ // Only keep the relevant coeffs
+ qcoeff = vandq_s16(qcoeff, zbin_mask);
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+ calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
+
+ return qcoeff;
+}
+
void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr,
@@ -38,109 +75,59 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan) {
- const int16x8_t one = vdupq_n_s16(1);
const int16x8_t neg_one = vdupq_n_s16(-1);
uint16x8_t eob_max;
- (void)scan;
+
+ // Only the first element of each vector is DC.
+ int16x8_t zbin = vld1q_s16(zbin_ptr);
+ int16x8_t round = vld1q_s16(round_ptr);
+ int16x8_t quant = vld1q_s16(quant_ptr);
+ int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
+ int16x8_t dequant = vld1q_s16(dequant_ptr);
// Process first 8 values which include a dc component.
{
- // Only the first element of each vector is DC.
- const int16x8_t zbin = vld1q_s16(zbin_ptr);
- const int16x8_t round = vld1q_s16(round_ptr);
- const int16x8_t quant = vld1q_s16(quant_ptr);
- const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
- const int16x8_t dequant = vld1q_s16(dequant_ptr);
- // Add one because the eob does not index from 0.
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
-
- const int16x8_t zbin_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
- const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
- // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
- int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
- qcoeff = vaddq_s16(qcoeff, rounded);
-
- // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
- qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
-
- // Restore the sign bit.
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
- qcoeff = vandq_s16(qcoeff, zbin_mask);
+ const int16x8_t qcoeff =
+ quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant,
+ quant_shift, dequant);
// Set non-zero elements to -1 and use that to extract values for eob.
eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+ __builtin_prefetch(coeff_ptr + 64);
coeff_ptr += 8;
iscan += 8;
-
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
-
- calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
}
n_coeffs -= 8;
{
- const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]);
- const int16x8_t round = vdupq_n_s16(round_ptr[1]);
- const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
- const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
- const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
+ zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+ round = vdupq_lane_s16(vget_low_s16(round), 1);
+ quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+ quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+ dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
do {
- // Add one because the eob is not its index.
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
-
- const int16x8_t zbin_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
- const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
- // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
- int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
- qcoeff = vaddq_s16(qcoeff, rounded);
-
- // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
- qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
-
- // Restore the sign bit.
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
- qcoeff = vandq_s16(qcoeff, zbin_mask);
+ const int16x8_t qcoeff =
+ quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
// Set non-zero elements to -1 and use that to extract values for eob.
eob_max =
vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+ __builtin_prefetch(coeff_ptr + 64);
coeff_ptr += 8;
iscan += 8;
-
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
-
- calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
-
n_coeffs -= 8;
} while (n_coeffs > 0);
}
@@ -156,6 +143,9 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
vst1_lane_u16(eob_ptr, eob_max_2, 0);
}
#endif // __aarch64__
+ // Need these here, else the compiler complains about mixing declarations and
+ // code in C90
+ (void)scan;
}
static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
@@ -164,7 +154,7 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
const int16x8_t dequant,
- tran_low_t *dqcoeff) {
+ tran_low_t *dqcoeff_ptr) {
int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
int32x4_t dqcoeff_1 =
vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
@@ -176,14 +166,51 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
#if CONFIG_VP9_HIGHBITDEPTH
dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
- vst1q_s32(dqcoeff, dqcoeff_0);
- vst1q_s32(dqcoeff + 4, dqcoeff_1);
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
#else
- vst1q_s16(dqcoeff,
+ vst1q_s16(dqcoeff_ptr,
vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
#endif // CONFIG_VP9_HIGHBITDEPTH
}
+static INLINE int16x8_t
+quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+ const int16x8_t round, const int16x8_t quant,
+ const int16x8_t quant_shift, const int16x8_t dequant) {
+ // Load coeffs as 8 x 16-bit ints, take sign and abs values
+ const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+ const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+ // Calculate mask of elements outside the bin
+ const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+ // Get the rounded values
+ const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+ // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+ int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+ qcoeff = vaddq_s16(qcoeff, rounded);
+
+ // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
+ qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
+
+ // Restore the sign bit.
+ qcoeff = veorq_s16(qcoeff, coeff_sign);
+ qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+ // Only keep the relevant coeffs
+ qcoeff = vandq_s16(qcoeff, zbin_mask);
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
+
+ return qcoeff;
+}
+
// Main difference is that zbin values are halved before comparison and dqcoeff
// values are divided by 2. zbin is rounded but dqcoeff is not.
void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -194,107 +221,57 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- const int16x8_t one = vdupq_n_s16(1);
const int16x8_t neg_one = vdupq_n_s16(-1);
uint16x8_t eob_max;
int i;
- (void)scan;
- (void)n_coeffs; // Because we will always calculate 32*32.
+
+ // Only the first element of each vector is DC.
+ int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1);
+ int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
+ int16x8_t quant = vld1q_s16(quant_ptr);
+ int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
+ int16x8_t dequant = vld1q_s16(dequant_ptr);
// Process first 8 values which include a dc component.
{
- // Only the first element of each vector is DC.
- const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1);
- const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
- const int16x8_t quant = vld1q_s16(quant_ptr);
- const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
- const int16x8_t dequant = vld1q_s16(dequant_ptr);
- // Add one because the eob does not index from 0.
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
-
- const int16x8_t zbin_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
-
- const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
- // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
- int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
- qcoeff = vaddq_s16(qcoeff, rounded);
-
- // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
- qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
-
- // Restore the sign bit.
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
- qcoeff = vandq_s16(qcoeff, zbin_mask);
+ const int16x8_t qcoeff =
+ quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
// Set non-zero elements to -1 and use that to extract values for eob.
eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+ __builtin_prefetch(coeff_ptr + 64);
coeff_ptr += 8;
iscan += 8;
-
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
-
- calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
}
{
- const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1);
- const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1);
- const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
- const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
- const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
+ zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+ round = vdupq_lane_s16(vget_low_s16(round), 1);
+ quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+ quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+ dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
for (i = 1; i < 32 * 32 / 8; ++i) {
- // Add one because the eob is not its index.
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
-
- const int16x8_t zbin_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
- const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
- // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
- int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
- qcoeff = vaddq_s16(qcoeff, rounded);
-
- // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
- qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
-
- // Restore the sign bit.
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
- qcoeff = vandq_s16(qcoeff, zbin_mask);
+ const int16x8_t qcoeff =
+ quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
// Set non-zero elements to -1 and use that to extract values for eob.
eob_max =
vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+ __builtin_prefetch(coeff_ptr + 64);
coeff_ptr += 8;
iscan += 8;
-
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
-
- calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
}
}
@@ -310,4 +287,8 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
vst1_lane_u16(eob_ptr, eob_max_2, 0);
}
#endif // __aarch64__
+ // Need these here, else the compiler complains about mixing declarations and
+ // code in C90
+ (void)n_coeffs;
+ (void)scan;
}
diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c
index 03f716c3d..5fc621aee 100644
--- a/vpx_dsp/arm/sad4d_neon.c
+++ b/vpx_dsp/arm/sad4d_neon.c
@@ -20,9 +20,9 @@
static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0,
const void *const buf1) {
uint32_t a;
- uint32x2_t aa = vdup_n_u32(0);
+ uint32x2_t aa;
memcpy(&a, buf0, 4);
- aa = vset_lane_u32(a, aa, 0);
+ aa = vdup_n_u32(a);
memcpy(&a, buf1, 4);
aa = vset_lane_u32(a, aa, 1);
return vreinterpret_u8_u32(aa);
@@ -237,8 +237,7 @@ void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
////////////////////////////////////////////////////////////////////////////////
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
- (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
uint32x4_t *const sum) {
@@ -270,7 +269,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
-#else
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
uint16x8_t *const sum) {
@@ -305,7 +304,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
sad_512_pel_final_neon(sum, sad_array);
}
-#endif
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -327,8 +326,7 @@ void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
////////////////////////////////////////////////////////////////////////////////
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
- (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -386,7 +384,7 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64);
}
-#else
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -444,12 +442,11 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
sad_2048_pel_final_neon(sum, sad_array);
}
-#endif
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
////////////////////////////////////////////////////////////////////////////////
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
- (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -554,7 +551,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
-#else
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -649,4 +646,4 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
sad_4096_pel_final_neon(sum, sad_array);
}
-#endif
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c
index b1509d883..ad575d4aa 100644
--- a/vpx_dsp/arm/sad_neon.c
+++ b/vpx_dsp/arm/sad_neon.c
@@ -21,9 +21,15 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride) {
const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
+#if defined(__ARM_FEATURE_DOTPROD)
+ const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8);
+ const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1));
+ return horizontal_add_uint32x4(dp);
+#else
uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8));
abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
return horizontal_add_uint16x8(abs);
+#endif
}
uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
@@ -33,13 +39,34 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
const uint8x16_t second_pred_u8 = vld1q_u8(second_pred);
const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
+#if defined(__ARM_FEATURE_DOTPROD)
+ const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg);
+ const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1));
+ return horizontal_add_uint32x4(prod);
+#else
uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg));
abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg));
return horizontal_add_uint16x8(abs);
+#endif
}
uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride) {
+#if defined(__ARM_FEATURE_DOTPROD)
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride);
+ const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
+ const uint8x16_t src2_u8 =
+ load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride);
+ const uint8x16_t ref2_u8 =
+ load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride);
+ const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, ref1_u8);
+ const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, ref2_u8);
+ prod = vdotq_u32(prod, sad1_u8, ones);
+ prod = vdotq_u32(prod, sad2_u8, ones);
+ return horizontal_add_uint32x4(prod);
+#else
int i;
uint16x8_t abs = vdupq_n_u16(0);
for (i = 0; i < 8; i += 4) {
@@ -52,11 +79,31 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
}
return horizontal_add_uint16x8(abs);
+#endif
}
uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const uint8_t *second_pred) {
+#if defined(__ARM_FEATURE_DOTPROD)
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride);
+ const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
+ const uint8x16_t src2_u8 =
+ load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride);
+ const uint8x16_t ref2_u8 =
+ load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride);
+ const uint8x16_t second_pred1_u8 = vld1q_u8(second_pred);
+ const uint8x16_t second_pred2_u8 = vld1q_u8(second_pred + 16);
+ const uint8x16_t avg1 = vrhaddq_u8(ref1_u8, second_pred1_u8);
+ const uint8x16_t avg2 = vrhaddq_u8(ref2_u8, second_pred2_u8);
+ const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, avg1);
+ const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, avg2);
+ prod = vdotq_u32(prod, sad1_u8, ones);
+ prod = vdotq_u32(prod, sad2_u8, ones);
+ return horizontal_add_uint32x4(prod);
+#else
int i;
uint16x8_t abs = vdupq_n_u16(0);
for (i = 0; i < 8; i += 4) {
@@ -72,8 +119,65 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
}
return horizontal_add_uint16x8(abs);
+#endif
}
+#if defined(__ARM_FEATURE_DOTPROD)
+static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int height) {
+ int i;
+ uint32x2_t prod = vdup_n_u32(0);
+ const uint8x8_t ones = vdup_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x8_t a_u8 = vld1_u8(src_ptr);
+ const uint8x8_t b_u8 = vld1_u8(ref_ptr);
+ const uint8x8_t sad_u8 = vabd_u8(a_u8, b_u8);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ prod = vdot_u32(prod, sad_u8, ones);
+ }
+ return prod;
+}
+
+static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
+ int i;
+ uint32x2_t prod = vdup_n_u32(0);
+ const uint8x8_t ones = vdup_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x8_t a_u8 = vld1_u8(src_ptr);
+ const uint8x8_t b_u8 = vld1_u8(ref_ptr);
+ const uint8x8_t c_u8 = vld1_u8(second_pred);
+ const uint8x8_t avg = vrhadd_u8(b_u8, c_u8);
+ const uint8x8_t sad_u8 = vabd_u8(a_u8, avg);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 8;
+ prod = vdot_u32(prod, sad_u8, ones);
+ }
+ return prod;
+}
+
+#define SAD8XN(n) \
+ uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ const uint32x2_t prod = \
+ sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+ return horizontal_add_uint32x2(prod); \
+ } \
+ \
+ uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint32x2_t prod = \
+ sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+ return horizontal_add_uint32x2(prod); \
+ }
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -124,11 +228,67 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return horizontal_add_uint16x8(abs); \
}
+#endif // defined(__ARM_FEATURE_DOTPROD)
SAD8XN(4)
SAD8XN(8)
SAD8XN(16)
+#if defined(__ARM_FEATURE_DOTPROD)
+static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t src_u8 = vld1q_u8(src_ptr);
+ const uint8x16_t ref_u8 = vld1q_u8(ref_ptr);
+ const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ prod = vdotq_u32(prod, sad_u8, ones);
+ }
+ return prod;
+}
+
+static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_u8 = vld1q_u8(src_ptr);
+ const uint8x16_t b_u8 = vld1q_u8(ref_ptr);
+ const uint8x16_t c_u8 = vld1q_u8(second_pred);
+ const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8);
+ const uint8x16_t sad_u8 = vabdq_u8(a_u8, avg);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ prod = vdotq_u32(prod, sad_u8, ones);
+ }
+ return prod;
+}
+
+#define SAD16XN(n) \
+ uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ const uint32x4_t prod = \
+ sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+ return horizontal_add_uint32x4(prod); \
+ } \
+ \
+ uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint32x4_t prod = \
+ sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+ return horizontal_add_uint32x4(prod); \
+ }
+#else // !defined(__ARM_FEATURE_DOTPROD)
static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -182,11 +342,78 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return horizontal_add_uint16x8(abs); \
}
+#endif // defined(__ARM_FEATURE_DOTPROD)
SAD16XN(8)
SAD16XN(16)
SAD16XN(32)
+#if defined(__ARM_FEATURE_DOTPROD)
+static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_lo = vld1q_u8(src_ptr);
+ const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
+ const uint8x16_t b_lo = vld1q_u8(ref_ptr);
+ const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
+ const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, b_lo);
+ const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, b_hi);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ prod = vdotq_u32(prod, sad_lo_u8, ones);
+ prod = vdotq_u32(prod, sad_hi_u8, ones);
+ }
+ return prod;
+}
+
+static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_lo = vld1q_u8(src_ptr);
+ const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
+ const uint8x16_t b_lo = vld1q_u8(ref_ptr);
+ const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
+ const uint8x16_t c_lo = vld1q_u8(second_pred);
+ const uint8x16_t c_hi = vld1q_u8(second_pred + 16);
+ const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo);
+ const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi);
+ const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, avg_lo);
+ const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, avg_hi);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ prod = vdotq_u32(prod, sad_lo_u8, ones);
+ prod = vdotq_u32(prod, sad_hi_u8, ones);
+ }
+ return prod;
+}
+
+#define SAD32XN(n) \
+ uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ const uint32x4_t prod = \
+ sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+ return horizontal_add_uint32x4(prod); \
+ } \
+ \
+ uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint32x4_t prod = \
+ sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+ return horizontal_add_uint32x4(prod); \
+ }
+
+#else // defined(__ARM_FEATURE_DOTPROD)
static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -250,11 +477,81 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return horizontal_add_uint16x8(abs); \
}
+#endif // defined(__ARM_FEATURE_DOTPROD)
SAD32XN(16)
SAD32XN(32)
SAD32XN(64)
+#if defined(__ARM_FEATURE_DOTPROD)
+static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_0 = vld1q_u8(src_ptr);
+ const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
+ const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
+ const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
+ const uint8x16_t b_0 = vld1q_u8(ref_ptr);
+ const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
+ const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
+ const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
+ const uint8x16_t sad_0_u8 = vabdq_u8(a_0, b_0);
+ const uint8x16_t sad_1_u8 = vabdq_u8(a_1, b_1);
+ const uint8x16_t sad_2_u8 = vabdq_u8(a_2, b_2);
+ const uint8x16_t sad_3_u8 = vabdq_u8(a_3, b_3);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ prod = vdotq_u32(prod, sad_0_u8, ones);
+ prod = vdotq_u32(prod, sad_1_u8, ones);
+ prod = vdotq_u32(prod, sad_2_u8, ones);
+ prod = vdotq_u32(prod, sad_3_u8, ones);
+ }
+ return prod;
+}
+
+static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_0 = vld1q_u8(src_ptr);
+ const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
+ const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
+ const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
+ const uint8x16_t b_0 = vld1q_u8(ref_ptr);
+ const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
+ const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
+ const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
+ const uint8x16_t c_0 = vld1q_u8(second_pred);
+ const uint8x16_t c_1 = vld1q_u8(second_pred + 16);
+ const uint8x16_t c_2 = vld1q_u8(second_pred + 32);
+ const uint8x16_t c_3 = vld1q_u8(second_pred + 48);
+ const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0);
+ const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1);
+ const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2);
+ const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3);
+ const uint8x16_t sad_0_u8 = vabdq_u8(a_0, avg_0);
+ const uint8x16_t sad_1_u8 = vabdq_u8(a_1, avg_1);
+ const uint8x16_t sad_2_u8 = vabdq_u8(a_2, avg_2);
+ const uint8x16_t sad_3_u8 = vabdq_u8(a_3, avg_3);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ prod = vdotq_u32(prod, sad_0_u8, ones);
+ prod = vdotq_u32(prod, sad_1_u8, ones);
+ prod = vdotq_u32(prod, sad_2_u8, ones);
+ prod = vdotq_u32(prod, sad_3_u8, ones);
+ }
+ return prod;
+}
+#else // !defined(__ARM_FEATURE_DOTPROD)
static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -332,6 +629,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
return vpadalq_u16(sum, abs_1);
}
}
+#endif // defined(__ARM_FEATURE_DOTPROD)
#define SAD64XN(n) \
uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \
diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c
index a3befdc34..9328c3ed8 100644
--- a/vpx_dsp/arm/subpel_variance_neon.c
+++ b/vpx_dsp/arm/subpel_variance_neon.c
@@ -17,168 +17,474 @@
#include "vpx_dsp/variance.h"
#include "vpx_dsp/arm/mem_neon.h"
-static const uint8_t bilinear_filters[8][2] = {
- { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
- { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
-};
-
// Process a block exactly 4 wide and a multiple of 2 high.
-static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- const uint8_t *filter) {
- const uint8x8_t f0 = vdup_n_u8(filter[0]);
- const uint8x8_t f1 = vdup_n_u8(filter[1]);
- unsigned int i;
- for (i = 0; i < output_height; i += 2) {
- const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line);
- const uint8x8_t src_1 =
- load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
- const uint16x8_t a = vmull_u8(src_0, f0);
- const uint16x8_t b = vmlal_u8(a, src_1, f1);
- const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
- vst1_u8(output_ptr, out);
- src_ptr += 2 * src_pixels_per_line;
- output_ptr += 8;
- }
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ vst1_u8(dst_ptr, blend_u8);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ i -= 2;
+ } while (i != 0);
}
// Process a block exactly 8 wide and any height.
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- const uint8_t *filter) {
- const uint8x8_t f0 = vdup_n_u8(filter[0]);
- const uint8x8_t f1 = vdup_n_u8(filter[1]);
- unsigned int i;
- for (i = 0; i < output_height; ++i) {
- const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
- const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
- const uint16x8_t a = vmull_u8(src_0, f0);
- const uint16x8_t b = vmlal_u8(a, src_1, f1);
- const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
- vst1_u8(output_ptr, out);
- src_ptr += src_pixels_per_line;
- output_ptr += 8;
- }
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ vst1_u8(dst_ptr, blend_u8);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ } while (--i != 0);
}
// Process a block which is a mutiple of 16 wide and any height.
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *filter) {
- const uint8x8_t f0 = vdup_n_u8(filter[0]);
- const uint8x8_t f1 = vdup_n_u8(filter[1]);
- unsigned int i, j;
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; j += 16) {
- const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
- const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
- const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
- const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
- const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
- const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
- const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
- const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
- vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi));
- }
- src_ptr += src_pixels_per_line;
- output_ptr += output_width;
- }
+static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l =
+ vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
+ uint16x8_t blend_h =
+ vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
+ uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3);
+ uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3);
+ vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi));
+
+ j += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
+ dst_height, filter_offset);
+}
+static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
+ dst_height, filter_offset);
+}
+static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
+ dst_height, filter_offset);
+}
+
+static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_width, int dst_height) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
}
-// 4xM filter writes an extra row to fdata because it processes two rows at a
-// time.
-#define SUB_PIXEL_VARIANCENXM(n, m) \
- uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \
- const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
- const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
- uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \
- uint8_t temp1[n * m]; \
- \
- if (n == 4) { \
- var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \
- bilinear_filters[y_offset]); \
- } else if (n == 8) { \
- var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \
- bilinear_filters[y_offset]); \
- } else { \
- var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \
- bilinear_filters[y_offset]); \
- } \
- return vpx_variance##n##x##m(temp1, n, ref_ptr, ref_stride, sse); \
+#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
}
-SUB_PIXEL_VARIANCENXM(4, 4)
-SUB_PIXEL_VARIANCENXM(4, 8)
-SUB_PIXEL_VARIANCENXM(8, 4)
-SUB_PIXEL_VARIANCENXM(8, 8)
-SUB_PIXEL_VARIANCENXM(8, 16)
-SUB_PIXEL_VARIANCENXM(16, 8)
-SUB_PIXEL_VARIANCENXM(16, 16)
-SUB_PIXEL_VARIANCENXM(16, 32)
-SUB_PIXEL_VARIANCENXM(32, 16)
-SUB_PIXEL_VARIANCENXM(32, 32)
-SUB_PIXEL_VARIANCENXM(32, 64)
-SUB_PIXEL_VARIANCENXM(64, 32)
-SUB_PIXEL_VARIANCENXM(64, 64)
-
-// 4xM filter writes an extra row to fdata because it processes two rows at a
-// time.
-#define SUB_PIXEL_AVG_VARIANCENXM(n, m) \
- uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \
- const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
- const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return vpx_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \
+ sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \
+ yoffset); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 4x<h> blocks are processed two rows at a time, so require an extra row of
+// padding.
+SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4.
+static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+ uint8x8_t p = vld1_u8(second_pred);
+ uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ second_pred += 2 * 4;
+ i -= 2;
+ } while (i != 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 8.
+static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+ uint8x8_t p = vld1_u8(second_pred);
+ uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ second_pred += 8;
+ } while (--i > 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for large blocks.
+static void avg_pred_var_filter_block2d_bil_large(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l =
+ vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
+ uint16x8_t blend_h =
+ vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
+ uint8x16_t blend_u8 =
+ vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ uint8x16_t avg = vrhaddq_u8(blend_u8, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 16.
+static void avg_pred_var_filter_block2d_bil_w16(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 16, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 32.
+static void avg_pred_var_filter_block2d_bil_w32(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 32, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 64.
+static void avg_pred_var_filter_block2d_bil_w64(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 64, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with vpx_comp_avg_pred.
+static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height,
+ const uint8_t *second_pred) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ avg = vrhaddq_u8(avg, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Implementation of vpx_comp_avg_pred for blocks having width >= 16.
+static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
+ int dst_width, int dst_height,
+ const uint8_t *second_pred) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr + j);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t avg = vrhaddq_u8(s, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
const uint8_t *second_pred) { \
- uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \
- uint8_t temp1[n * m]; \
- \
- if (n == 4) { \
- var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \
- bilinear_filters[y_offset]); \
- } else if (n == 8) { \
- var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \
- bilinear_filters[y_offset]); \
- } else { \
- var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \
- bilinear_filters[y_offset]); \
- } \
- \
- vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \
- \
- return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+ xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
+
+#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ if (xoffset == 0) { \
+ uint8_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ avg_pred(src, tmp, source_stride, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \
+ source_stride, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \
+ second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \
+ xoffset, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
}
-SUB_PIXEL_AVG_VARIANCENXM(4, 4)
-SUB_PIXEL_AVG_VARIANCENXM(4, 8)
-SUB_PIXEL_AVG_VARIANCENXM(8, 4)
-SUB_PIXEL_AVG_VARIANCENXM(8, 8)
-SUB_PIXEL_AVG_VARIANCENXM(8, 16)
-SUB_PIXEL_AVG_VARIANCENXM(16, 8)
-SUB_PIXEL_AVG_VARIANCENXM(16, 16)
-SUB_PIXEL_AVG_VARIANCENXM(16, 32)
-SUB_PIXEL_AVG_VARIANCENXM(32, 16)
-SUB_PIXEL_AVG_VARIANCENXM(32, 32)
-SUB_PIXEL_AVG_VARIANCENXM(32, 64)
-SUB_PIXEL_AVG_VARIANCENXM(64, 32)
-SUB_PIXEL_AVG_VARIANCENXM(64, 64)
+// 4x<h> blocks are processed two rows at a time, so require an extra row of
+// padding.
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
diff --git a/vpx_dsp/arm/subtract_neon.c b/vpx_dsp/arm/subtract_neon.c
index 612897e24..2c008e48a 100644
--- a/vpx_dsp/arm/subtract_neon.c
+++ b/vpx_dsp/arm/subtract_neon.c
@@ -79,3 +79,59 @@ void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
} while (r);
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride,
+ const uint8_t *src8_ptr,
+ ptrdiff_t src_stride,
+ const uint8_t *pred8_ptr,
+ ptrdiff_t pred_stride, int bd) {
+ int r = rows, c;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr);
+ (void)bd;
+
+ if (cols >= 16) {
+ do {
+ for (c = 0; c < cols; c += 16) {
+ const uint16x8_t s0 = vld1q_u16(&src[c + 0]);
+ const uint16x8_t s1 = vld1q_u16(&src[c + 8]);
+ const uint16x8_t p0 = vld1q_u16(&pred[c + 0]);
+ const uint16x8_t p1 = vld1q_u16(&pred[c + 8]);
+ const uint16x8_t d0 = vsubq_u16(s0, p0);
+ const uint16x8_t d1 = vsubq_u16(s1, p1);
+ vst1q_s16(&diff_ptr[c + 0], vreinterpretq_s16_u16(d0));
+ vst1q_s16(&diff_ptr[c + 8], vreinterpretq_s16_u16(d1));
+ }
+ diff_ptr += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ } else if (cols >= 8) {
+ do {
+ for (c = 0; c < cols; c += 8) {
+ const uint16x8_t s = vld1q_u16(&src[c]);
+ const uint16x8_t p = vld1q_u16(&pred[c]);
+ const uint16x8_t d0 = vsubq_u16(s, p);
+ vst1q_s16(&diff_ptr[c], vreinterpretq_s16_u16(d0));
+ }
+ diff_ptr += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ } else if (cols >= 4) {
+ do {
+ for (c = 0; c < cols; c += 4) {
+ const uint16x4_t s = vld1_u16(&src[c]);
+ const uint16x4_t p = vld1_u16(&pred[c]);
+ const uint16x4_t v_diff = vsub_u16(s, p);
+ vst1_s16(&diff_ptr[c], vreinterpret_s16_u16(v_diff));
+ }
+ diff_ptr += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index c098ad31b..41d44f2b1 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -568,6 +568,40 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
*a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
+// Transpose 8x8 to a new location.
+static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) {
+ // Swap 16 bit elements.
+ const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
+
+ // Swap 32 bit elements.
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
+ const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
+ vreinterpretq_s32_s16(c3.val[0]));
+ const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
+ vreinterpretq_s32_s16(c3.val[1]));
+
+ // Swap 64 bit elements
+ const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
+ const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
+ const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
+ const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
+
+ b[0] = e0.val[0];
+ b[1] = e1.val[0];
+ b[2] = e2.val[0];
+ b[3] = e3.val[0];
+ b[4] = e0.val[1];
+ b[5] = e1.val[1];
+ b[6] = e2.val[1];
+ b[7] = e3.val[1];
+}
+
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
int16x8_t *a2, int16x8_t *a3,
int16x8_t *a4, int16x8_t *a5,
@@ -787,6 +821,51 @@ static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
a7->val[1] = c7.val[1];
}
+// Helper transpose function for highbd FDCT variants
+static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
+ int32x4_t *right /*[8]*/,
+ int32x4_t *out_left /*[8]*/,
+ int32x4_t *out_right /*[8]*/) {
+ int32x4x2_t out[8];
+
+ out[0].val[0] = left[0];
+ out[0].val[1] = right[0];
+ out[1].val[0] = left[1];
+ out[1].val[1] = right[1];
+ out[2].val[0] = left[2];
+ out[2].val[1] = right[2];
+ out[3].val[0] = left[3];
+ out[3].val[1] = right[3];
+ out[4].val[0] = left[4];
+ out[4].val[1] = right[4];
+ out[5].val[0] = left[5];
+ out[5].val[1] = right[5];
+ out[6].val[0] = left[6];
+ out[6].val[1] = right[6];
+ out[7].val[0] = left[7];
+ out[7].val[1] = right[7];
+
+ transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+
+ out_left[0] = out[0].val[0];
+ out_left[1] = out[1].val[0];
+ out_left[2] = out[2].val[0];
+ out_left[3] = out[3].val[0];
+ out_left[4] = out[4].val[0];
+ out_left[5] = out[5].val[0];
+ out_left[6] = out[6].val[0];
+ out_left[7] = out[7].val[0];
+ out_right[0] = out[0].val[1];
+ out_right[1] = out[1].val[1];
+ out_right[2] = out[2].val[1];
+ out_right[3] = out[3].val[1];
+ out_right[4] = out[4].val[1];
+ out_right[5] = out[5].val[1];
+ out_right[6] = out[6].val[1];
+ out_right[7] = out[7].val[1];
+}
+
static INLINE void transpose_u8_16x8(
const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c
index 7b93f142b..3ccc4e807 100644
--- a/vpx_dsp/arm/variance_neon.c
+++ b/vpx_dsp/arm/variance_neon.c
@@ -19,345 +19,357 @@
#include "vpx_dsp/arm/sum_neon.h"
#include "vpx_ports/mem.h"
-#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__ARM_FEATURE_DOTPROD)
// Process a block of width 4 four rows at a time.
-static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- int i;
- uint32x4_t sum_a = vdupq_n_u32(0);
- uint32x4_t sum_b = vdupq_n_u32(0);
+static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
uint32x4_t sse_u32 = vdupq_n_u32(0);
- for (i = 0; i < h; i += 4) {
- const uint8x16_t a = load_unaligned_u8q(src_ptr, src_stride);
- const uint8x16_t b = load_unaligned_u8q(ref_ptr, ref_stride);
+ int i = h;
+ do {
+ const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
+ const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
- const uint8x16_t abs_diff = vabdq_u8(a, b);
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
- sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1));
- sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1));
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
src_ptr += 4 * src_stride;
ref_ptr += 4 * ref_stride;
- }
+ i -= 4;
+ } while (i != 0);
- *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b)));
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
*sse = horizontal_add_uint32x4(sse_u32);
}
-// Process a block of any size where the width is divisible by 16.
-static void variance_neon_w16(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int w,
- int h, uint32_t *sse, int *sum) {
- int i, j;
- uint32x4_t sum_a = vdupq_n_u32(0);
- uint32x4_t sum_b = vdupq_n_u32(0);
+// Process a block of width 8 two rows at a time.
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
uint32x4_t sse_u32 = vdupq_n_u32(0);
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- const uint8x16_t a = vld1q_u8(src_ptr + j);
- const uint8x16_t b = vld1q_u8(ref_ptr + j);
+ int i = h;
+ do {
+ const uint8x16_t s =
+ vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride));
+ const uint8x16_t r =
+ vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride));
- const uint8x16_t abs_diff = vabdq_u8(a, b);
- sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+ *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of width 16 one row at a time.
+static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr);
+ const uint8x16_t r = vld1q_u8(ref_ptr);
+
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
- sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1));
- sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1));
- }
src_ptr += src_stride;
ref_ptr += ref_stride;
- }
+ } while (--i != 0);
- *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b)));
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
*sse = horizontal_add_uint32x4(sse_u32);
}
-// Process a block of width 8 two rows at a time.
-static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- int i = 0;
- uint32x2_t sum_a = vdup_n_u32(0);
- uint32x2_t sum_b = vdup_n_u32(0);
- uint32x2_t sse_lo_u32 = vdup_n_u32(0);
- uint32x2_t sse_hi_u32 = vdup_n_u32(0);
+// Process a block of any size where the width is divisible by 16.
+static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ int i = h;
do {
- const uint8x8_t a_0 = vld1_u8(src_ptr);
- const uint8x8_t a_1 = vld1_u8(src_ptr + src_stride);
- const uint8x8_t b_0 = vld1_u8(ref_ptr);
- const uint8x8_t b_1 = vld1_u8(ref_ptr + ref_stride);
-
- const uint8x8_t abs_diff_0 = vabd_u8(a_0, b_0);
- const uint8x8_t abs_diff_1 = vabd_u8(a_1, b_1);
- sse_lo_u32 = vdot_u32(sse_lo_u32, abs_diff_0, abs_diff_0);
- sse_hi_u32 = vdot_u32(sse_hi_u32, abs_diff_1, abs_diff_1);
-
- sum_a = vdot_u32(sum_a, a_0, vdup_n_u8(1));
- sum_b = vdot_u32(sum_b, b_0, vdup_n_u8(1));
- sum_a = vdot_u32(sum_a, a_1, vdup_n_u8(1));
- sum_b = vdot_u32(sum_b, b_1, vdup_n_u8(1));
-
- src_ptr += src_stride + src_stride;
- ref_ptr += ref_stride + ref_stride;
- i += 2;
- } while (i < h);
+ int j = 0;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr + j);
+ const uint8x16_t r = vld1q_u8(ref_ptr + j);
+
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ j += 16;
+ } while (j < w);
- *sum = horizontal_add_int32x2(vreinterpret_s32_u32(vsub_u32(sum_a, sum_b)));
- *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32));
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+ *sse = horizontal_add_uint32x4(sse_u32);
}
-#else
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
-// The variance helper functions use int16_t for sum. 8 values are accumulated
-// and then added (at which point they expand up to int32_t). To avoid overflow,
-// there can be no more than 32767 / 255 ~= 128 values accumulated in each
-// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32
-// rows = 128. Asserts have been added to each function to warn against reaching
-// this limit.
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
-// Process a block of width 4 four rows at a time.
-static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- int i;
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+// Process a block of width 4 two rows at a time.
+static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
int16x8_t sum_s16 = vdupq_n_s16(0);
- int32x4_t sse_lo_s32 = vdupq_n_s32(0);
- int32x4_t sse_hi_s32 = vdupq_n_s32(0);
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+ int i = h;
- // Since width is only 4, sum_s16 only loads a half row per loop.
+ // Number of rows we can process before 'sum_s16' overflows:
+ // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows.
assert(h <= 256);
- for (i = 0; i < h; i += 4) {
- const uint8x16_t a_u8 = load_unaligned_u8q(src_ptr, src_stride);
- const uint8x16_t b_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
- const uint16x8_t diff_lo_u16 =
- vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8));
- const uint16x8_t diff_hi_u16 =
- vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8));
-
- const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16);
- const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16);
-
- sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
- sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
+ do {
+ const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16),
- vget_low_s16(diff_lo_s16));
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16),
- vget_high_s16(diff_lo_s16));
+ sum_s16 = vaddq_s16(sum_s16, diff);
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16),
- vget_low_s16(diff_hi_s16));
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16),
- vget_high_s16(diff_hi_s16));
+ sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
- src_ptr += 4 * src_stride;
- ref_ptr += 4 * ref_stride;
- }
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
*sum = horizontal_add_int16x8(sum_s16);
- *sse = horizontal_add_uint32x4(
- vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32)));
+ *sse = (uint32_t)horizontal_add_int32x4(sse_s32);
}
-// Process a block of any size where the width is divisible by 16.
-static void variance_neon_w16(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int w,
- int h, uint32_t *sse, int *sum) {
- int i, j;
+// Process a block of width 8 one row at a time.
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
int16x8_t sum_s16 = vdupq_n_s16(0);
- int32x4_t sse_lo_s32 = vdupq_n_s32(0);
- int32x4_t sse_hi_s32 = vdupq_n_s32(0);
-
- // The loop loads 16 values at a time but doubles them up when accumulating
- // into sum_s16.
- assert(w / 8 * h <= 128);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- const uint8x16_t a_u8 = vld1q_u8(src_ptr + j);
- const uint8x16_t b_u8 = vld1q_u8(ref_ptr + j);
-
- const uint16x8_t diff_lo_u16 =
- vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8));
- const uint16x8_t diff_hi_u16 =
- vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8));
-
- const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16);
- const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16);
-
- sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
- sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
-
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16),
- vget_low_s16(diff_lo_s16));
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16),
- vget_high_s16(diff_lo_s16));
-
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16),
- vget_low_s16(diff_hi_s16));
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16),
- vget_high_s16(diff_hi_s16));
- }
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int i = h;
+
+ // Number of rows we can process before 'sum_s16' overflows:
+ // 32767 / 255 ~= 128
+ assert(h <= 128);
+
+ do {
+ const uint8x8_t s = vld1_u8(src_ptr);
+ const uint8x8_t r = vld1_u8(ref_ptr);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
src_ptr += src_stride;
ref_ptr += ref_stride;
- }
+ } while (--i != 0);
*sum = horizontal_add_int16x8(sum_s16);
- *sse = horizontal_add_uint32x4(
- vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32)));
+ *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
}
-// Process a block of width 8 two rows at a time.
-static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- int i = 0;
- int16x8_t sum_s16 = vdupq_n_s16(0);
- int32x4_t sse_lo_s32 = vdupq_n_s32(0);
- int32x4_t sse_hi_s32 = vdupq_n_s32(0);
+// Process a block of width 16 one row at a time.
+static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int i = h;
- // Each column has it's own accumulator entry in sum_s16.
+ // Number of rows we can process before 'sum_s16' accumulators overflow:
+ // 32767 / 255 ~= 128, so 128 16-wide rows.
assert(h <= 128);
do {
- const uint8x8_t a_0_u8 = vld1_u8(src_ptr);
- const uint8x8_t a_1_u8 = vld1_u8(src_ptr + src_stride);
- const uint8x8_t b_0_u8 = vld1_u8(ref_ptr);
- const uint8x8_t b_1_u8 = vld1_u8(ref_ptr + ref_stride);
- const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8);
- const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8);
- const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16);
- const int16x8_t diff_1_s16 = vreinterpretq_s16_u16(diff_1_u16);
- sum_s16 = vaddq_s16(sum_s16, diff_0_s16);
- sum_s16 = vaddq_s16(sum_s16, diff_1_s16);
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_0_s16),
- vget_low_s16(diff_0_s16));
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_1_s16),
- vget_low_s16(diff_1_s16));
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_0_s16),
- vget_high_s16(diff_0_s16));
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16),
- vget_high_s16(diff_1_s16));
- src_ptr += src_stride + src_stride;
- ref_ptr += ref_stride + ref_stride;
- i += 2;
+ const uint8x16_t s = vld1q_u8(src_ptr);
+ const uint8x16_t r = vld1q_u8(ref_ptr);
+
+ const int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ const int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_int16x8(vaddq_s16(sum_s16[0], sum_s16[1]));
+ *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+// Process a block of any size where the width is divisible by 16.
+static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int w, int h, int h_limit,
+ unsigned int *sse, int *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit
+ // accumulator overflows. After hitting this limit we accumulate into 32-bit
+ // elements.
+ int h_tmp = h > h_limit ? h_limit : h;
+
+ int i = 0;
+ do {
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr + j);
+ const uint8x16_t r = vld1q_u8(ref_ptr + j);
+
+ const int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ const int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ j += 16;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ i++;
+ } while (i < h_tmp);
+
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]);
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]);
+
+ h_tmp += h_limit;
} while (i < h);
- *sum = horizontal_add_int16x8(sum_s16);
- *sse = horizontal_add_uint32x4(
- vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32)));
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
}
-#endif
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum);
+}
+
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum);
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
- variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum);
+ variance_8xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum);
}
void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
- variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
+ variance_16xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, sum);
}
-#define VARIANCENXM(n, m, shift) \
- unsigned int vpx_variance##n##x##m##_neon( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride, unsigned int *sse) { \
- int sum; \
- if (n == 4) \
- variance_neon_w4x4(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \
- &sum); \
- else if (n == 8) \
- variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \
- &sum); \
- else \
- variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, n, m, sse, \
- &sum); \
- if (n * m < 16 * 16) \
- return *sse - ((sum * sum) >> shift); \
- else \
- return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+#define VARIANCE_WXH_NEON(w, h, shift) \
+ unsigned int vpx_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
}
-VARIANCENXM(4, 4, 4)
-VARIANCENXM(4, 8, 5)
-VARIANCENXM(8, 4, 5)
-VARIANCENXM(8, 8, 6)
-VARIANCENXM(8, 16, 7)
-VARIANCENXM(16, 8, 7)
-VARIANCENXM(16, 16, 8)
-VARIANCENXM(16, 32, 9)
-VARIANCENXM(32, 16, 9)
-VARIANCENXM(32, 32, 10)
-
-unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
- variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32, &sse1,
- &sum1);
- variance_neon_w16(src_ptr + (32 * src_stride), src_stride,
- ref_ptr + (32 * ref_stride), ref_stride, 32, 32, &sse2,
- &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
+VARIANCE_WXH_NEON(4, 4, 4)
+VARIANCE_WXH_NEON(4, 8, 5)
-unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
- variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1,
- &sum1);
- variance_neon_w16(src_ptr + (16 * src_stride), src_stride,
- ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2,
- &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
+VARIANCE_WXH_NEON(8, 4, 5)
+VARIANCE_WXH_NEON(8, 8, 6)
+VARIANCE_WXH_NEON(8, 16, 7)
-unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
-
- variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1,
- &sum1);
- variance_neon_w16(src_ptr + (16 * src_stride), src_stride,
- ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2,
- &sum2);
- sse1 += sse2;
- sum1 += sum2;
-
- variance_neon_w16(src_ptr + (16 * 2 * src_stride), src_stride,
- ref_ptr + (16 * 2 * ref_stride), ref_stride, 64, 16, &sse2,
- &sum2);
- sse1 += sse2;
- sum1 += sum2;
-
- variance_neon_w16(src_ptr + (16 * 3 * src_stride), src_stride,
- ref_ptr + (16 * 3 * ref_stride), ref_stride, 64, 16, &sse2,
- &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
-}
+VARIANCE_WXH_NEON(16, 8, 7)
+VARIANCE_WXH_NEON(16, 16, 8)
+VARIANCE_WXH_NEON(16, 32, 9)
+
+VARIANCE_WXH_NEON(32, 16, 9)
+VARIANCE_WXH_NEON(32, 32, 10)
+VARIANCE_WXH_NEON(32, 64, 11)
+
+VARIANCE_WXH_NEON(64, 32, 11)
+VARIANCE_WXH_NEON(64, 64, 12)
-#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__ARM_FEATURE_DOTPROD)
unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
@@ -421,7 +433,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
return vget_lane_u32(sse, 0);
}
-#else
+#else // !defined(__ARM_FEATURE_DOTPROD)
unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
@@ -518,4 +530,4 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse));
}
-#endif
+#endif // defined(__ARM_FEATURE_DOTPROD)
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c
index 06b58c438..b4cdd58c7 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -31,8 +31,9 @@
// instructions. This optimization is much faster in speed unit test, but slowed
// down the whole decoder by 5%.
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
- (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__aarch64__) && \
+ (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
+
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
@@ -53,9 +54,176 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};
-static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1,
- int8x8_t *a2, int8x8_t *a3,
- int8x16_t *b,
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ uint8x16_t s0, s1, s2, s3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int32x4_t t0, t1, t2, t3;
+ int16x8_t t01, t23;
+ uint8x8_t d01, d23;
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_usdot(s0, filters, permute_tbl);
+ t1 = convolve8_4_usdot(s1, filters, permute_tbl);
+ t2 = convolve8_4_usdot(s2, filters, permute_tbl);
+ t3 = convolve8_4_usdot(s3, filters, permute_tbl);
+ t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+ t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+ d01 = vqrshrun_n_s16(t01, 7);
+ d23 = vqrshrun_n_s16(t23, 7);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_usdot(s0, filters, permute_tbl);
+ d1 = convolve8_8_usdot(s1, filters, permute_tbl);
+ d2 = convolve8_8_usdot(s2, filters, permute_tbl);
+ d3 = convolve8_8_usdot(s3, filters, permute_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ uint8x16_t s0, s1, s2, s3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int32x4_t t0, t1, t2, t3;
+ int16x8_t t01, t23;
+ uint8x8_t d01, d23, dd01, dd23;
+ dd01 = vdup_n_u8(0);
+ dd23 = vdup_n_u8(0);
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_usdot(s0, filters, permute_tbl);
+ t1 = convolve8_4_usdot(s1, filters, permute_tbl);
+ t2 = convolve8_4_usdot(s2, filters, permute_tbl);
+ t3 = convolve8_4_usdot(s3, filters, permute_tbl);
+ t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+ t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+ d01 = vqrshrun_n_s16(t01, 7);
+ d23 = vqrshrun_n_s16(t23, 7);
+
+ dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+ d01 = vrhadd_u8(d01, dd01);
+ d23 = vrhadd_u8(d23, dd23);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_usdot(s0, filters, permute_tbl);
+ d1 = convolve8_8_usdot(s1, filters, permute_tbl);
+ d2 = convolve8_8_usdot(s2, filters, permute_tbl);
+ d3 = convolve8_8_usdot(s3, filters, permute_tbl);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ d0 = vrhadd_u8(d0, dd0);
+ d1 = vrhadd_u8(d1, dd1);
+ d2 = vrhadd_u8(d2, dd2);
+ d3 = vrhadd_u8(d3, dd3);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b,
const uint8x16_t permute_tbl) {
/* Transpose 8-bit elements and concatenate result rows as follows:
* a0: 00, 01, 02, 03, XX, XX, XX, XX
@@ -70,13 +238,13 @@ static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1,
* inline helper is called many times from the same parent function.
*/
- int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } };
- *b = vqtbl2q_s8(samples, permute_tbl);
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b = vqtbl2q_u8(samples, permute_tbl);
}
-static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1,
- int8x8_t *a2, int8x8_t *a3,
- int8x16_t *b0, int8x16_t *b1,
+static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b0, uint8x16_t *b1,
const uint8x16x2_t permute_tbl) {
/* Transpose 8-bit elements and concatenate result rows as follows:
* a0: 00, 01, 02, 03, 04, 05, 06, 07
@@ -92,11 +260,364 @@ static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1,
* inline helper is called many times from the same parent function.
*/
- int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } };
- *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
- *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
+}
+
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint8x16x2_t samples_LUT;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(y_step_q4 == 16);
+
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int32x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+
+ load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_usdot_partial(s0123, s4567, filters);
+ d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
+ d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
+ d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
+ d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ filters);
+ d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ filters);
+ d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ filters);
+ d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ filters);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
}
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint8x16x2_t samples_LUT;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(y_step_q4 == 16);
+
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int32x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23, dd01, dd23;
+
+ load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_usdot_partial(s0123, s4567, filters);
+ d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
+ d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
+ d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
+ d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+ dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+ d01 = vrhadd_u8(d01, dd01);
+ d23 = vrhadd_u8(d23, dd23);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ filters);
+ d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ filters);
+ d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ filters);
+ d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ filters);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ d0 = vrhadd_u8(d0, dd0);
+ d1 = vrhadd_u8(d1, dd1);
+ d2 = vrhadd_u8(d2, dd2);
+ d3 = vrhadd_u8(d3, dd3);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
+#else // !defined(__ARM_FEATURE_MATMUL_INT8)
+
void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *filter, int x0_q4,
@@ -125,33 +646,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
int16x8_t t01, t23;
uint8x8_t d01, d23;
- s0 = vld1q_u8(src);
- src += src_stride;
- s1 = vld1q_u8(src);
- src += src_stride;
- s2 = vld1q_u8(src);
- src += src_stride;
- s3 = vld1q_u8(src);
- src += src_stride;
-
- t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl);
- t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl);
- t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl);
- t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl);
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+ t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl);
+ t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl);
+ t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl);
+ t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl);
t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
d01 = vqrshrun_n_s16(t01, 7);
d23 = vqrshrun_n_s16(t23, 7);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
- dst += dst_stride;
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
@@ -166,20 +676,18 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s = src;
d = dst;
do {
- s0 = vld1q_u8(s + 0 * src_stride);
- s1 = vld1q_u8(s + 1 * src_stride);
- s2 = vld1q_u8(s + 2 * src_stride);
- s3 = vld1q_u8(s + 3 * src_stride);
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl);
- d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl);
- d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl);
- d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl);
+ d0 =
+ convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl);
+ d1 =
+ convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl);
+ d2 =
+ convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl);
+ d3 =
+ convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl);
- vst1_u8(d + 0 * dst_stride, d0);
- vst1_u8(d + 1 * dst_stride, d1);
- vst1_u8(d + 2 * dst_stride, d2);
- vst1_u8(d + 3 * dst_stride, d3);
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
d += 8;
@@ -222,20 +730,12 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
dd01 = vdup_n_u8(0);
dd23 = vdup_n_u8(0);
- s0 = vld1q_u8(src);
- src += src_stride;
- s1 = vld1q_u8(src);
- src += src_stride;
- s2 = vld1q_u8(src);
- src += src_stride;
- s3 = vld1q_u8(src);
- src += src_stride;
-
- t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl);
- t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl);
- t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl);
- t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl);
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+ t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl);
+ t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl);
+ t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl);
+ t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl);
t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
d01 = vqrshrun_n_s16(t01, 7);
@@ -243,17 +743,15 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
- dst += dst_stride;
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
@@ -268,29 +766,25 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s = src;
d = dst;
do {
- s0 = vld1q_u8(s + 0 * src_stride);
- s1 = vld1q_u8(s + 1 * src_stride);
- s2 = vld1q_u8(s + 2 * src_stride);
- s3 = vld1q_u8(s + 3 * src_stride);
-
- d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl);
- d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl);
- d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl);
- d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl);
-
- dd0 = vld1_u8(d + 0 * dst_stride);
- dd1 = vld1_u8(d + 1 * dst_stride);
- dd2 = vld1_u8(d + 2 * dst_stride);
- dd3 = vld1_u8(d + 3 * dst_stride);
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 =
+ convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl);
+ d1 =
+ convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl);
+ d2 =
+ convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl);
+ d3 =
+ convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
d0 = vrhadd_u8(d0, dd0);
d1 = vrhadd_u8(d1, dd1);
d2 = vrhadd_u8(d2, dd2);
d3 = vrhadd_u8(d3, dd3);
- vst1_u8(d + 0 * dst_stride, d0);
- vst1_u8(d + 1 * dst_stride, d1);
- vst1_u8(d + 2 * dst_stride, d2);
- vst1_u8(d + 3 * dst_stride, d3);
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
d += 8;
@@ -303,6 +797,49 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
+static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b,
+ const uint8x16_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, XX, XX, XX, XX
+ * a1: 10, 11, 12, 13, XX, XX, XX, XX
+ * a2: 20, 21, 22, 23, XX, XX, XX, XX
+ * a3: 30, 31, 32, 33, XX, XX, XX, XX
+ *
+ * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b = vqtbl2q_s8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b0,
+ int8x16_t *b1,
+ const uint8x16x2_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, 04, 05, 06, 07
+ * a1: 10, 11, 12, 13, 14, 15, 16, 17
+ * a2: 20, 21, 22, 23, 24, 25, 26, 27
+ * a3: 30, 31, 32, 33, 34, 35, 36, 37
+ *
+ * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+}
+
void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *filter, int x0_q4,
@@ -333,14 +870,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
int32x4_t d0, d1, d2, d3;
uint8x8_t d01, d23;
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- src += 4 * src_stride;
- t4 = vld1_u8(src);
- src += src_stride;
- t5 = vld1_u8(src);
- src += src_stride;
- t6 = vld1_u8(src);
- src += src_stride;
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ src += 7 * src_stride;
/* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
@@ -357,13 +888,13 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This operation combines a conventional transpose and the sample permute
* (see horizontal case) required before computing the dot product.
*/
- transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl);
- transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl);
- transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl);
- transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl);
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
do {
uint8x8_t t7, t8, t9, t10;
@@ -375,7 +906,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
- transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl);
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
/* Merge new data into block from previous iteration. */
samples_LUT.val[0] = s3456;
@@ -384,22 +915,15 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters);
- d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters);
- d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters);
- d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters);
-
+ d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
+ d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
+ d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
+ d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
- dst += dst_stride;
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
/* Prepare block for next iteration - re-using as much as possible. */
/* Shuffle everything up four rows. */
@@ -409,6 +933,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s3456 = s78910;
src += 4 * src_stride;
+ dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
@@ -426,14 +951,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s = src;
d = dst;
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- s += 4 * src_stride;
- t4 = vld1_u8(s);
- s += src_stride;
- t5 = vld1_u8(s);
- s += src_stride;
- t6 = vld1_u8(s);
- s += src_stride;
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s += 7 * src_stride;
/* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
@@ -450,19 +969,19 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This operation combines a conventional transpose and the sample permute
* (see horizontal case) required before computing the dot product.
*/
- transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi,
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi,
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi,
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi,
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi,
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi,
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi,
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
tran_concat_tbl);
do {
@@ -475,7 +994,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
- transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi,
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
tran_concat_tbl);
/* Merge new data into block from previous iteration. */
@@ -491,18 +1010,16 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- correction, filters);
- d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- correction, filters);
- d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- correction, filters);
- d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- correction, filters);
- vst1_u8(d + 0 * dst_stride, d0);
- vst1_u8(d + 1 * dst_stride, d1);
- vst1_u8(d + 2 * dst_stride, d2);
- vst1_u8(d + 3 * dst_stride, d3);
+ d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ correction, filters);
+ d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ correction, filters);
+ d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ correction, filters);
+ d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ correction, filters);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
/* Prepare block for next iteration - re-using as much as possible. */
/* Shuffle everything up four rows. */
@@ -556,14 +1073,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
int32x4_t d0, d1, d2, d3;
uint8x8_t d01, d23, dd01, dd23;
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- src += 4 * src_stride;
- t4 = vld1_u8(src);
- src += src_stride;
- t5 = vld1_u8(src);
- src += src_stride;
- t6 = vld1_u8(src);
- src += src_stride;
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ src += 7 * src_stride;
/* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
@@ -580,13 +1091,13 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This operation combines a conventional transpose and the sample permute
* (see horizontal case) required before computing the dot product.
*/
- transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl);
- transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl);
- transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl);
- transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl);
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
do {
uint8x8_t t7, t8, t9, t10;
@@ -598,7 +1109,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
- transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl);
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
/* Merge new data into block from previous iteration. */
samples_LUT.val[0] = s3456;
@@ -607,27 +1118,21 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters);
- d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters);
- d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters);
- d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters);
-
+ d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
+ d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
+ d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
+ d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
- dst += dst_stride;
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
/* Prepare block for next iteration - re-using as much as possible. */
/* Shuffle everything up four rows. */
@@ -637,6 +1142,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s3456 = s78910;
src += 4 * src_stride;
+ dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
@@ -654,14 +1160,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s = src;
d = dst;
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- s += 4 * src_stride;
- t4 = vld1_u8(s);
- s += src_stride;
- t5 = vld1_u8(s);
- s += src_stride;
- t6 = vld1_u8(s);
- s += src_stride;
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s += 7 * src_stride;
/* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
@@ -678,19 +1178,19 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This operation combines a conventional transpose and the sample permute
* (see horizontal case) required before computing the dot product.
*/
- transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi,
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi,
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi,
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi,
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi,
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi,
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi,
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
tran_concat_tbl);
do {
@@ -703,7 +1203,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
- transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi,
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
tran_concat_tbl);
/* Merge new data into block from previous iteration. */
@@ -719,28 +1219,23 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- correction, filters);
- d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- correction, filters);
- d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- correction, filters);
- d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- correction, filters);
-
- dd0 = vld1_u8(d + 0 * dst_stride);
- dd1 = vld1_u8(d + 1 * dst_stride);
- dd2 = vld1_u8(d + 2 * dst_stride);
- dd3 = vld1_u8(d + 3 * dst_stride);
+ d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ correction, filters);
+ d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ correction, filters);
+ d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ correction, filters);
+ d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ correction, filters);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
d0 = vrhadd_u8(d0, dd0);
d1 = vrhadd_u8(d1, dd1);
d2 = vrhadd_u8(d2, dd2);
d3 = vrhadd_u8(d3, dd3);
- vst1_u8(d + 0 * dst_stride, d0);
- vst1_u8(d + 1 * dst_stride, d1);
- vst1_u8(d + 2 * dst_stride, d2);
- vst1_u8(d + 3 * dst_stride, d3);
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
/* Prepare block for next iteration - re-using as much as possible. */
/* Shuffle everything up four rows. */
@@ -764,29 +1259,11 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
-#else
-
-static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
- const uint8x8_t s0, const uint8x8_t s1,
- const uint8x8_t s2, const uint8x8_t s3,
- const uint8x8_t s4, const uint8x8_t s5,
- const uint8x8_t s6, const uint8x8_t s7) {
- vst1_u8(s, s0);
- s += p;
- vst1_u8(s, s1);
- s += p;
- vst1_u8(s, s2);
- s += p;
- vst1_u8(s, s3);
- s += p;
- vst1_u8(s, s4);
- s += p;
- vst1_u8(s, s5);
- s += p;
- vst1_u8(s, s6);
- s += p;
- vst1_u8(s, s7);
-}
+#endif // defined(__ARM_FEATURE_MATMUL_INT8)
+
+#else // !(defined(__aarch64__) &&
+ // (defined(__ARM_FEATURE_DOTPROD) ||
+ // defined(__ARM_FEATURE_MATMUL_INT8)))
void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
@@ -808,16 +1285,13 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
if (h == 4) {
uint8x8_t d01, d23;
- int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
- d1, d2, d3;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
int16x8_t tt0, tt1, tt2, tt3;
__builtin_prefetch(src + 0 * src_stride);
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
- filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
@@ -849,14 +1323,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vget_low_s16(tt2);
s10 = vget_low_s16(tt3);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
@@ -883,8 +1353,6 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
w -= 4;
} while (w != 0);
} else {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
int width;
const uint8_t *s;
uint8x8_t t4, t5, t6, t7;
@@ -927,14 +1395,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
__builtin_prefetch(src + 7 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
@@ -1002,22 +1466,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
- t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
- filter4);
- t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
- filter4);
- t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
- filter4);
- t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
- filter3, filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+ t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+ t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+ t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
@@ -1061,8 +1517,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
if (h == 4) {
uint8x8_t d01, d23;
- int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
- d1, d2, d3;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
int16x8_t tt0, tt1, tt2, tt3;
uint32x4_t d0123 = vdupq_n_u32(0);
@@ -1070,8 +1525,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
- filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
@@ -1103,14 +1556,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vget_low_s16(tt2);
s10 = vget_low_s16(tt3);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
@@ -1140,8 +1589,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
w -= 4;
} while (w != 0);
} else {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
int width;
const uint8_t *s;
uint8x8_t t4, t5, t6, t7;
@@ -1186,14 +1633,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
__builtin_prefetch(src + 7 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
@@ -1276,22 +1719,14 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
- t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
- filter4);
- t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
- filter4);
- t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
- filter4);
- t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
- filter3, filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+ t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+ t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+ t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -1349,8 +1784,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
src -= 3 * src_stride;
if (w == 4) {
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t d01, d23;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
@@ -1387,14 +1820,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
@@ -1417,8 +1846,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
h -= 4;
} while (h != 0);
} else {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
int height;
const uint8_t *s;
uint8_t *d;
@@ -1469,14 +1896,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
vst1_u8(d, t0);
d += dst_stride;
@@ -1521,8 +1944,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
src -= 3 * src_stride;
if (w == 4) {
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t d01, d23;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
uint32x4_t d0123 = vdupq_n_u32(0);
@@ -1560,14 +1981,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
@@ -1598,8 +2015,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
h -= 4;
} while (h != 0);
} else {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
int height;
const uint8_t *s;
uint8_t *d;
@@ -1651,14 +2066,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vcombine_u8(t0, t1);
d23 = vcombine_u8(t2, t3);
@@ -1694,4 +2105,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
-#endif
+#endif // #if defined(__aarch64__) &&
+ // (defined(__ARM_FEATURE_DOTPROD) ||
+ // defined(__ARM_FEATURE_MATMUL_INT8))
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h
index 857b6d54e..ed7f18053 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -16,69 +16,12 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
-static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
- uint8x8_t *const s0, uint8x8_t *const s1,
- uint8x8_t *const s2, uint8x8_t *const s3) {
- *s0 = vld1_u8(s);
- s += p;
- *s1 = vld1_u8(s);
- s += p;
- *s2 = vld1_u8(s);
- s += p;
- *s3 = vld1_u8(s);
-}
-
-static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
- uint8x8_t *const s0, uint8x8_t *const s1,
- uint8x8_t *const s2, uint8x8_t *const s3,
- uint8x8_t *const s4, uint8x8_t *const s5,
- uint8x8_t *const s6, uint8x8_t *const s7) {
- *s0 = vld1_u8(s);
- s += p;
- *s1 = vld1_u8(s);
- s += p;
- *s2 = vld1_u8(s);
- s += p;
- *s3 = vld1_u8(s);
- s += p;
- *s4 = vld1_u8(s);
- s += p;
- *s5 = vld1_u8(s);
- s += p;
- *s6 = vld1_u8(s);
- s += p;
- *s7 = vld1_u8(s);
-}
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
-static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
- uint8x16_t *const s0, uint8x16_t *const s1,
- uint8x16_t *const s2, uint8x16_t *const s3,
- uint8x16_t *const s4, uint8x16_t *const s5,
- uint8x16_t *const s6, uint8x16_t *const s7) {
- *s0 = vld1q_u8(s);
- s += p;
- *s1 = vld1q_u8(s);
- s += p;
- *s2 = vld1q_u8(s);
- s += p;
- *s3 = vld1q_u8(s);
- s += p;
- *s4 = vld1q_u8(s);
- s += p;
- *s5 = vld1q_u8(s);
- s += p;
- *s6 = vld1q_u8(s);
- s += p;
- *s7 = vld1q_u8(s);
-}
-
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
- (__ARM_FEATURE_DOTPROD == 1)
-
-static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo,
- const int8x16_t samples_hi,
- const int32x4_t correction,
- const int8x8_t filters) {
+static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
+ const int8x16_t samples_hi,
+ const int32x4_t correction,
+ const int8x8_t filters) {
/* Sample range-clamping and permutation are performed by the caller. */
int32x4_t sum;
@@ -90,11 +33,11 @@ static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo,
return sum;
}
-static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x2_t permute_tbl) {
+static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x2_t permute_tbl) {
int8x16_t clamped_samples, permuted_samples[2];
int32x4_t sum;
@@ -115,12 +58,12 @@ static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples,
return sum;
}
-static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo,
- const int8x16_t samples0_hi,
- const int8x16_t samples1_lo,
- const int8x16_t samples1_hi,
- const int32x4_t correction,
- const int8x8_t filters) {
+static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
+ const int8x16_t samples0_hi,
+ const int8x16_t samples1_lo,
+ const int8x16_t samples1_hi,
+ const int32x4_t correction,
+ const int8x8_t filters) {
/* Sample range-clamping and permutation are performed by the caller. */
int32x4_t sum0, sum1;
int16x8_t sum;
@@ -138,11 +81,11 @@ static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo,
return vqrshrun_n_s16(sum, 7);
}
-static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl) {
+static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
int8x16_t clamped_samples, permuted_samples[3];
int32x4_t sum0, sum1;
int16x8_t sum;
@@ -171,15 +114,98 @@ static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples,
return vqrshrun_n_s16(sum, 7);
}
-#endif
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+
+static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
+ const uint8x16_t samples_hi,
+ const int8x8_t filters) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum;
+
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
+ sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
+
+ /* Narrowing and packing is performed by the caller. */
+ return sum;
+}
+
+static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ uint8x16_t permuted_samples[2];
+ int32x4_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+ /* Narrowing and packing is performed by the caller. */
+ return sum;
+}
+
+static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
+ const uint8x16_t samples0_hi,
+ const uint8x16_t samples1_lo,
+ const uint8x16_t samples1_hi,
+ const int8x8_t filters) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
+ sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
+ sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, 7);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x3_t permute_tbl) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+ sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, 7);
+}
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7,
- const int16x8_t filters,
- const int16x4_t filter3,
- const int16x4_t filter4) {
+ const int16x8_t filters) {
const int16x4_t filters_lo = vget_low_s16(filters);
const int16x4_t filters_hi = vget_high_s16(filters);
int16x4_t sum;
@@ -190,8 +216,8 @@ static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
sum = vmla_lane_s16(sum, s5, filters_hi, 1);
sum = vmla_lane_s16(sum, s6, filters_hi, 2);
sum = vmla_lane_s16(sum, s7, filters_hi, 3);
- sum = vqadd_s16(sum, vmul_s16(s3, filter3));
- sum = vqadd_s16(sum, vmul_s16(s4, filter4));
+ sum = vqadd_s16(sum, vmul_lane_s16(s3, filters_lo, 3));
+ sum = vqadd_s16(sum, vmul_lane_s16(s4, filters_hi, 0));
return sum;
}
@@ -199,9 +225,7 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7,
- const int16x8_t filters,
- const int16x8_t filter3,
- const int16x8_t filter4) {
+ const int16x8_t filters) {
const int16x4_t filters_lo = vget_low_s16(filters);
const int16x4_t filters_hi = vget_high_s16(filters);
int16x8_t sum;
@@ -212,15 +236,13 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
- sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
- sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0));
return vqrshrun_n_s16(sum, 7);
}
static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
const int16x8_t filters) {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
int16x8_t ss[8];
ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
@@ -233,7 +255,7 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
- filters, filter3, filter4);
+ filters);
}
#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
diff --git a/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
index 8edf8a66e..b8e3c5e54 100644
--- a/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
+++ b/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
@@ -15,6 +15,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_ports/mem.h"
@@ -38,8 +39,6 @@ static INLINE void scaledconvolve_horiz_w4(
const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
if (x_q4 & SUBPEL_MASK) {
const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t s[8], d;
int16x8_t ss[4];
int16x4_t t[8], tt;
@@ -61,7 +60,7 @@ static INLINE void scaledconvolve_horiz_w4(
t[7] = vget_high_s16(ss[3]);
tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
- filters, filter3, filter4);
+ filters);
d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
} else {
@@ -167,8 +166,6 @@ static INLINE void scaledconvolve_vert_w4(
if (y_q4 & SUBPEL_MASK) {
const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t s[8], d;
int16x4_t t[8], tt;
@@ -183,8 +180,7 @@ static INLINE void scaledconvolve_vert_w4(
t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
- tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
- filter3, filter4);
+ tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
} else {
diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c
index 1c45e8a73..954015407 100644
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -7,6 +7,8 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+
+#include <assert.h>
#include <stdlib.h>
#include "./vpx_dsp_rtcd.h"
@@ -344,6 +346,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
const int ref_stride, const int height) {
int idx;
const int norm_factor = height >> 1;
+ assert(height >= 2);
for (idx = 0; idx < 16; ++idx) {
int i;
hbuf[idx] = 0;
diff --git a/vpx_dsp/bitwriter.h b/vpx_dsp/bitwriter.h
index 04084af8f..5f1ee69ec 100644
--- a/vpx_dsp/bitwriter.h
+++ b/vpx_dsp/bitwriter.h
@@ -13,6 +13,7 @@
#include <stdio.h>
+#include "vpx_ports/compiler_attributes.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/prob.h"
@@ -35,7 +36,9 @@ typedef struct vpx_writer {
void vpx_start_encode(vpx_writer *br, uint8_t *source);
void vpx_stop_encode(vpx_writer *br);
-static INLINE void vpx_write(vpx_writer *br, int bit, int probability) {
+static INLINE VPX_NO_UNSIGNED_SHIFT_CHECK void vpx_write(vpx_writer *br,
+ int bit,
+ int probability) {
unsigned int split;
int count = br->count;
unsigned int range = br->range;
diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c
index 2fc33b06b..77be0bb4f 100644
--- a/vpx_dsp/loongarch/quantize_lsx.c
+++ b/vpx_dsp/loongarch/quantize_lsx.c
@@ -59,7 +59,6 @@ static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff,
}
static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1,
- __m128i zbin_mask0, __m128i zbin_mask1,
const int16_t *scan, int index,
__m128i zero) {
const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero);
@@ -68,8 +67,6 @@ static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1,
__m128i scan1 = __lsx_vld(scan + index + 8, 0);
__m128i eob0, eob1;
- scan0 = __lsx_vsub_h(scan0, zbin_mask0);
- scan1 = __lsx_vsub_h(scan1, zbin_mask1);
eob0 = __lsx_vandn_v(zero_coeff0, scan0);
eob1 = __lsx_vandn_v(zero_coeff1, scan1);
return __lsx_vmax_h(eob0, eob1);
@@ -138,7 +135,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
dequant = __lsx_vilvh_d(dequant, dequant);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
// AC only loop.
while (index < n_coeffs) {
coeff0 = __lsx_vld(coeff_ptr + index, 0);
@@ -161,8 +158,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
eob = __lsx_vmax_h(eob, eob0);
index += 16;
@@ -221,7 +217,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr);
dequant = __lsx_vilvh_d(dequant, dequant);
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
// AC only loop.
for (index = 16; index < 32 * 32; index += 16) {
coeff0 = __lsx_vld(coeff_ptr + index, 0);
@@ -243,8 +239,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index);
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant,
dqcoeff_ptr + 8 + index);
- eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
eob = __lsx_vmax_h(eob, eob0);
}
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 995602831..d6504aab1 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -159,7 +159,7 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
}
-static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
uint8_t *op3, uint8_t *op2, uint8_t *op1,
uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
uint8_t *oq2, uint8_t *oq3) {
@@ -232,8 +232,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
}
-static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat,
- uint8_t flat2, uint8_t *op7, uint8_t *op6,
+static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat,
+ int8_t flat2, uint8_t *op7, uint8_t *op6,
uint8_t *op5, uint8_t *op4, uint8_t *op3,
uint8_t *op2, uint8_t *op1, uint8_t *op0,
uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
@@ -505,7 +505,7 @@ void vpx_highbd_lpf_vertical_4_dual_c(
bd);
}
-static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
uint16_t *op3, uint16_t *op2, uint16_t *op1,
uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
uint16_t *oq2, uint16_t *oq3, int bd) {
@@ -584,8 +584,8 @@ void vpx_highbd_lpf_vertical_8_dual_c(
bd);
}
-static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat,
- uint8_t flat2, uint16_t *op7, uint16_t *op6,
+static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat,
+ int8_t flat2, uint16_t *op7, uint16_t *op6,
uint16_t *op5, uint16_t *op4, uint16_t *op3,
uint16_t *op2, uint16_t *op1, uint16_t *op0,
uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h
index 3c2f50c79..d54ce5368 100644
--- a/vpx_dsp/mips/macros_msa.h
+++ b/vpx_dsp/mips/macros_msa.h
@@ -83,31 +83,33 @@
val_lh_m; \
})
-#define LW(psrc) \
- ({ \
- const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \
- uint32_t val_lw_m; \
- \
- __asm__ __volatile__("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
- "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
- : [val_lw_m] "=&r"(val_lw_m) \
- : [psrc_lw_m] "r"(psrc_lw_m)); \
- \
- val_lw_m; \
+#define LW(psrc) \
+ ({ \
+ const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \
+ uint32_t val_lw_m; \
+ \
+ __asm__ __volatile__( \
+ "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
+ "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
+ : [val_lw_m] "=&r"(val_lw_m) \
+ : [psrc_lw_m] "r"(psrc_lw_m)); \
+ \
+ val_lw_m; \
})
#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
- uint64_t val_ld_m = 0; \
- \
- __asm__ __volatile__("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
- "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
- : [val_ld_m] "=&r"(val_ld_m) \
- : [psrc_ld_m] "r"(psrc_ld_m)); \
- \
- val_ld_m; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
+ uint64_t val_ld_m = 0; \
+ \
+ __asm__ __volatile__( \
+ "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
+ "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
+ : [val_ld_m] "=&r"(val_ld_m) \
+ : [psrc_ld_m] "r"(psrc_ld_m)); \
+ \
+ val_ld_m; \
})
#else // !(__mips == 64)
#define LD(psrc) \
diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c
index 7cdcbeb40..ab71f6e23 100644
--- a/vpx_dsp/ppc/quantize_vsx.c
+++ b/vpx_dsp/ppc/quantize_vsx.c
@@ -78,11 +78,10 @@ static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack);
}
-static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask,
+static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff,
const int16_t *iscan_ptr, int index) {
int16x8_t scan = vec_vsx_ld(index, iscan_ptr);
bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
- scan = vec_sub(scan, mask);
return vec_andc(scan, zero_coeff);
}
@@ -139,8 +138,8 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr);
- eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0),
- nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16));
+ eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0),
+ nonzero_scanindex(qcoeff1, iscan_ptr, 16));
if (n_coeffs > 16) {
int index = 16;
@@ -177,10 +176,9 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
- eob =
- vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0));
- eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1),
- nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2));
+ eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0));
+ eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1),
+ nonzero_scanindex(qcoeff2, iscan_ptr, off2));
eob = vec_max(eob, eob2);
index += 24;
@@ -252,8 +250,8 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = vec_splat(dequant, 1); // remove DC from dequant
vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr);
- eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0),
- nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16));
+ eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0),
+ nonzero_scanindex(qcoeff1, iscan_ptr, 16));
do {
int16x8_t coeff2, coeff2_abs, qcoeff2, eob2;
@@ -286,9 +284,9 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr);
vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr);
- eob = vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0));
- eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1),
- nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2));
+ eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0));
+ eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1),
+ nonzero_scanindex(qcoeff2, iscan_ptr, off2));
eob = vec_max(eob, eob2);
// 24 int16_t is 48 bytes
diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c
index 48bac0450..f0d4e927a 100644
--- a/vpx_dsp/psnr.c
+++ b/vpx_dsp/psnr.c
@@ -26,57 +26,44 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) {
/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
* and highbd_8_variance(). It should not.
*/
-static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int w, int h, unsigned int *sse,
- int *sum) {
+static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h) {
int i, j;
-
- *sum = 0;
- *sse = 0;
+ int64_t sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
+ sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
+
+ return sse;
}
#if CONFIG_VP9_HIGHBITDEPTH
-static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w,
- int h, uint64_t *sse, int64_t *sum) {
+static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w,
+ int h) {
int i, j;
+ int64_t sse = 0;
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- *sum = 0;
- *sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
+ sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
-}
-static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w,
- int h, unsigned int *sse, int *sum) {
- uint64_t sse_long = 0;
- int64_t sum_long = 0;
- encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long,
- &sum_long);
- *sse = (unsigned int)sse_long;
- *sum = (int)sum_long;
+ return sse;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -85,26 +72,23 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
const int dw = width % 16;
const int dh = height % 16;
int64_t total_sse = 0;
- unsigned int sse = 0;
- int sum = 0;
int x, y;
if (dw > 0) {
- encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw,
- height, &sse, &sum);
- total_sse += sse;
+ total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride,
+ dw, height);
}
if (dh > 0) {
- encoder_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride, width - dw, dh,
- &sse, &sum);
- total_sse += sse;
+ total_sse +=
+ encoder_sse(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride, width - dw, dh);
}
for (y = 0; y < height / 16; ++y) {
const uint8_t *pa = a;
const uint8_t *pb = b;
+ unsigned int sse;
for (x = 0; x < width / 16; ++x) {
vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
@@ -146,22 +130,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
int x, y;
const int dw = width % 16;
const int dh = height % 16;
- unsigned int sse = 0;
- int sum = 0;
if (dw > 0) {
- encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw],
- b_stride, dw, height, &sse, &sum);
- total_sse += sse;
+ total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw],
+ b_stride, dw, height);
}
if (dh > 0) {
- encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride,
- width - dw, dh, &sse, &sum);
- total_sse += sse;
+ total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride,
+ width - dw, dh);
}
for (y = 0; y < height / 16; ++y) {
const uint8_t *pa = a;
const uint8_t *pb = b;
+ unsigned int sse;
for (x = 0; x < width / 16; ++x) {
vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index 30b55dcb4..ce1e8382b 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -549,9 +549,9 @@ HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
-void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint16_t *pred,
- int width, int height, const uint16_t *ref,
- int ref_stride) {
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred,
+ int width, int height, const uint16_t *ref,
+ int ref_stride) {
int i, j;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 13999af04..1fd9495cf 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -226,19 +226,19 @@ DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h
ifeq ($(VPX_ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm
endif
-DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h
-DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/fdct4x4_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/fdct8x8_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c
-DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c
DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h
DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.c
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c
DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_dct32x32_lsx.c
endif # !CONFIG_VP9_HIGHBITDEPTH
@@ -326,11 +326,14 @@ DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.h
DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c
DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h
DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c
+DSP_SRCS-$(HAVE_AVX2) += x86/quantize_avx2.c
DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c
DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c
DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_quantize_intrin_avx2.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_quantize_neon.c
endif
# avg
@@ -374,6 +377,7 @@ DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/subtract_avx2.c
DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c
DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
@@ -388,6 +392,9 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS
@@ -425,6 +432,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index d3c668f9a..8725821b6 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -527,6 +527,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct4x4_1 sse2 neon/;
+ specialize qw/vpx_highbd_fdct4x4_1 neon/;
+ $vpx_highbd_fdct4x4_1_neon=vpx_fdct4x4_1_neon;
add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct8x8 neon sse2/;
@@ -550,27 +552,29 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_fdct32x32_1 sse2 neon/;
add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct4x4 sse2/;
+ specialize qw/vpx_highbd_fdct4x4 sse2 neon/;
add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct8x8 sse2/;
+ specialize qw/vpx_highbd_fdct8x8 sse2 neon/;
add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_highbd_fdct8x8_1 neon/;
$vpx_highbd_fdct8x8_1_neon=vpx_fdct8x8_1_neon;
add_proto qw/void vpx_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct16x16 sse2/;
+ specialize qw/vpx_highbd_fdct16x16 sse2 neon/;
add_proto qw/void vpx_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_highbd_fdct16x16_1 neon/;
add_proto qw/void vpx_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct32x32 sse2/;
+ specialize qw/vpx_highbd_fdct32x32 sse2 neon/;
add_proto qw/void vpx_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct32x32_rd sse2/;
+ specialize qw/vpx_highbd_fdct32x32_rd sse2 neon/;
add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_highbd_fdct32x32_1 neon/;
} else {
add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct4x4 neon sse2 msa lsx/;
@@ -711,17 +715,17 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx lsx/;
+ specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/;
add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx lsx/;
+ specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_highbd_quantize_b sse2/;
+ specialize qw/vpx_highbd_quantize_b neon sse2 avx2/;
add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
+ specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/;
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_VP9_ENCODER
@@ -730,7 +734,7 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") {
# Block subtraction
#
add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vpx_subtract_block neon msa mmi sse2 vsx lsx/;
+specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/;
#
# Single block SAD
@@ -795,7 +799,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx lsx/;
add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_hadamard_32x32 sse2 avx2/;
+ specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/;
add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
specialize qw/vpx_highbd_hadamard_8x8 avx2/;
@@ -819,7 +823,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx lsx/;
add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_32x32 sse2 avx2/;
+ specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/;
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
specialize qw/vpx_satd avx2 sse2 neon msa/;
@@ -935,46 +939,49 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Block subtraction
#
add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd";
+ specialize qw/vpx_highbd_subtract_block neon avx2/;
#
# Single block SAD
#
add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad64x64 sse2/;
+ specialize qw/vpx_highbd_sad64x64 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad64x32 sse2/;
+ specialize qw/vpx_highbd_sad64x32 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad32x64 sse2/;
+ specialize qw/vpx_highbd_sad32x64 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad32x32 sse2/;
+ specialize qw/vpx_highbd_sad32x32 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad32x16 sse2/;
+ specialize qw/vpx_highbd_sad32x16 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad16x32 sse2/;
+ specialize qw/vpx_highbd_sad16x32 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad16x16 sse2/;
+ specialize qw/vpx_highbd_sad16x16 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad16x8 sse2/;
+ specialize qw/vpx_highbd_sad16x8 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad8x16 sse2/;
+ specialize qw/vpx_highbd_sad8x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad8x8 sse2/;
+ specialize qw/vpx_highbd_sad8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad8x4 sse2/;
+ specialize qw/vpx_highbd_sad8x4 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad4x8 neon/;
add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad4x4 neon/;
#
# Avg
@@ -988,83 +995,85 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max";
add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad64x64_avg sse2/;
+ specialize qw/vpx_highbd_sad64x64_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad64x32_avg sse2/;
+ specialize qw/vpx_highbd_sad64x32_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad32x64_avg sse2/;
+ specialize qw/vpx_highbd_sad32x64_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad32x32_avg sse2/;
+ specialize qw/vpx_highbd_sad32x32_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad32x16_avg sse2/;
+ specialize qw/vpx_highbd_sad32x16_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad16x32_avg sse2/;
+ specialize qw/vpx_highbd_sad16x32_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad16x16_avg sse2/;
+ specialize qw/vpx_highbd_sad16x16_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad16x8_avg sse2/;
+ specialize qw/vpx_highbd_sad16x8_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad8x16_avg sse2/;
+ specialize qw/vpx_highbd_sad8x16_avg sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad8x8_avg sse2/;
+ specialize qw/vpx_highbd_sad8x8_avg sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad8x4_avg sse2/;
+ specialize qw/vpx_highbd_sad8x4_avg sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad4x8_avg neon/;
add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad4x4_avg neon/;
#
# Multi-block SAD, comparing a reference to N independent blocks
#
add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad64x64x4d sse2/;
+ specialize qw/vpx_highbd_sad64x64x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad64x32x4d sse2/;
+ specialize qw/vpx_highbd_sad64x32x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad32x64x4d sse2/;
+ specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad32x32x4d sse2/;
+ specialize qw/vpx_highbd_sad32x32x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad32x16x4d sse2/;
+ specialize qw/vpx_highbd_sad32x16x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad16x32x4d sse2/;
+ specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad16x16x4d sse2/;
+ specialize qw/vpx_highbd_sad16x16x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad16x8x4d sse2/;
+ specialize qw/vpx_highbd_sad16x8x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad8x16x4d sse2/;
+ specialize qw/vpx_highbd_sad8x16x4d sse2 neon/;
add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad8x8x4d sse2/;
+ specialize qw/vpx_highbd_sad8x8x4d sse2 neon/;
add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad8x4x4d sse2/;
+ specialize qw/vpx_highbd_sad8x4x4d sse2 neon/;
add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad4x8x4d sse2/;
+ specialize qw/vpx_highbd_sad4x8x4d sse2 neon/;
add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad4x4x4d sse2/;
+ specialize qw/vpx_highbd_sad4x4x4d sse2 neon/;
#
# Structured Similarity (SSIM)
@@ -1232,369 +1241,397 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance64x64 sse2/;
+ specialize qw/vpx_highbd_12_variance64x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance64x32 sse2/;
+ specialize qw/vpx_highbd_12_variance64x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x64 sse2/;
+ specialize qw/vpx_highbd_12_variance32x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x32 sse2/;
+ specialize qw/vpx_highbd_12_variance32x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x16 sse2/;
+ specialize qw/vpx_highbd_12_variance32x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x32 sse2/;
+ specialize qw/vpx_highbd_12_variance16x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x16 sse2/;
+ specialize qw/vpx_highbd_12_variance16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x8 sse2/;
+ specialize qw/vpx_highbd_12_variance16x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance8x16 sse2/;
+ specialize qw/vpx_highbd_12_variance8x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance8x8 sse2/;
+ specialize qw/vpx_highbd_12_variance8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance8x4 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance4x8 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance4x4 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance64x64 sse2/;
+ specialize qw/vpx_highbd_10_variance64x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance64x32 sse2/;
+ specialize qw/vpx_highbd_10_variance64x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x64 sse2/;
+ specialize qw/vpx_highbd_10_variance32x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x32 sse2/;
+ specialize qw/vpx_highbd_10_variance32x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x16 sse2/;
+ specialize qw/vpx_highbd_10_variance32x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x32 sse2/;
+ specialize qw/vpx_highbd_10_variance16x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x16 sse2/;
+ specialize qw/vpx_highbd_10_variance16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x8 sse2/;
+ specialize qw/vpx_highbd_10_variance16x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance8x16 sse2/;
+ specialize qw/vpx_highbd_10_variance8x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance8x8 sse2/;
+ specialize qw/vpx_highbd_10_variance8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance8x4 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance4x8 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance4x4 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance64x64 sse2/;
+ specialize qw/vpx_highbd_8_variance64x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance64x32 sse2/;
+ specialize qw/vpx_highbd_8_variance64x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x64 sse2/;
+ specialize qw/vpx_highbd_8_variance32x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x32 sse2/;
+ specialize qw/vpx_highbd_8_variance32x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x16 sse2/;
+ specialize qw/vpx_highbd_8_variance32x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x32 sse2/;
+ specialize qw/vpx_highbd_8_variance16x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x16 sse2/;
+ specialize qw/vpx_highbd_8_variance16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x8 sse2/;
+ specialize qw/vpx_highbd_8_variance16x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance8x16 sse2/;
+ specialize qw/vpx_highbd_8_variance8x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance8x8 sse2/;
+ specialize qw/vpx_highbd_8_variance8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance8x4 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance4x8 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance4x4 neon/;
add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_8_get16x16var sse2/;
+ specialize qw/vpx_highbd_8_get16x16var sse2 neon/;
add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_8_get8x8var sse2/;
+ specialize qw/vpx_highbd_8_get8x8var sse2 neon/;
add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_10_get16x16var sse2/;
+ specialize qw/vpx_highbd_10_get16x16var sse2 neon/;
add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_10_get8x8var sse2/;
+ specialize qw/vpx_highbd_10_get8x8var sse2 neon/;
add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_12_get16x16var sse2/;
+ specialize qw/vpx_highbd_12_get16x16var sse2 neon/;
add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_12_get8x8var sse2/;
+ specialize qw/vpx_highbd_12_get8x8var sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse16x16 sse2/;
+ specialize qw/vpx_highbd_8_mse16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_mse16x8 neon/;
add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_mse8x16 neon/;
add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse8x8 sse2/;
+ specialize qw/vpx_highbd_8_mse8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse16x16 sse2/;
+ specialize qw/vpx_highbd_10_mse16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_mse16x8 neon/;
add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_mse8x16 neon/;
add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse8x8 sse2/;
+ specialize qw/vpx_highbd_10_mse8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse16x16 sse2/;
+ specialize qw/vpx_highbd_12_mse16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_mse16x8 neon/;
add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_mse8x16 neon/;
add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse8x8 sse2/;
+ specialize qw/vpx_highbd_12_mse8x8 sse2 neon/;
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride";
+ specialize qw/vpx_highbd_comp_avg_pred neon sse2/;
#
# Subpixel Variance
#
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/;
} # CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
index 3f4f577a2..b2e01319d 100644
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -104,7 +104,7 @@ void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
- src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride));
src32[0] = _mm256_cvtepi16_epi32(src16[0]);
src32[1] = _mm256_cvtepi16_epi32(src16[1]);
@@ -304,7 +304,7 @@ static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
- src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride));
hadamard_col8x2_avx2(src, 0);
hadamard_col8x2_avx2(src, 1);
diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c
index 9da2f34c9..015c11a1f 100644
--- a/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -164,7 +164,7 @@ unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) {
s0 = _mm_add_epi32(s0, s1);
s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8));
s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4));
- avg = _mm_cvtsi128_si32(s0);
+ avg = (unsigned int)_mm_cvtsi128_si32(s0);
return (avg + 32) >> 6;
}
@@ -275,7 +275,7 @@ static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
- src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));
hadamard_col8_sse2(src, 0);
hadamard_col8_sse2(src, 1);
diff --git a/vpx_dsp/x86/convolve_avx2.h b/vpx_dsp/x86/convolve_avx2.h
index 99bc9637f..ebee964b1 100644
--- a/vpx_dsp/x86/convolve_avx2.h
+++ b/vpx_dsp/x86/convolve_avx2.h
@@ -129,9 +129,8 @@ static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1,
static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1,
__m128i *const dst_ptr_2,
const __m256i *const src) {
- *((uint32_t *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src));
- *((uint32_t *)(dst_ptr_2)) =
- _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
+ *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src));
+ *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
}
static INLINE __m256i mm256_round_epi32(const __m256i *const src,
diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
index 3f158b5e4..f3a802029 100644
--- a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -89,7 +89,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
- const __m256i kZero = _mm256_set1_epi16(0);
+ const __m256i kZero = _mm256_setzero_si256();
const __m256i kOne = _mm256_set1_epi16(1);
// Do the two transform/transpose passes
int pass;
diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
index ac1246faa..bf350b6da 100644
--- a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
+++ b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
@@ -100,7 +100,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kZero = _mm_setzero_si128();
const __m128i kOne = _mm_set1_epi16(1);
// Do the two transform/transpose passes
diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
index 78cf9111d..1d07391b0 100644
--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -249,7 +249,7 @@ static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
const int bd) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
// Faster than _mm_set1_epi16((1 << bd) - 1).
const __m128i one = _mm_set1_epi16(1);
const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c
index d265fc1a9..9f45623de 100644
--- a/vpx_dsp/x86/highbd_loopfilter_sse2.c
+++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -18,7 +18,7 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
__m128i lbounded;
__m128i retval;
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
__m128i t80, max, min;
@@ -51,7 +51,7 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
__m128i blimit_v, limit_v, thresh_v;
__m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
@@ -492,7 +492,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch,
DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
__m128i blimit_v, limit_v, thresh_v;
__m128i mask, hev, flat;
__m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
@@ -720,7 +720,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
__m128i blimit_v, limit_v, thresh_v;
__m128i mask, hev, flat;
__m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
new file mode 100644
index 000000000..8edddd637
--- /dev/null
+++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i sign = _mm_srai_epi16(*p, 15);
+ const __m128i dc = _mm_unpacklo_epi16(*p, sign);
+ const __m128i ac = _mm_unpackhi_epi16(*p, sign);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static VPX_FORCE_INLINE void update_qp(__m256i *qp) {
+ int i;
+ for (i = 0; i < 5; ++i) {
+ qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
+ }
+}
+
+static VPX_FORCE_INLINE void init_qp(const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ const int16_t *quant_shift_ptr,
+ __m256i *qp, int log_scale) {
+ const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
+ const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+ const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
+ init_one_qp(&zbin, &qp[0]);
+ init_one_qp(&round, &qp[1]);
+ init_one_qp(&quant, &qp[2]);
+ init_one_qp(&dequant, &qp[3]);
+ init_one_qp(&quant_shift, &qp[4]);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1)));
+ qp[0] = _mm256_add_epi32(qp[0], rnd);
+ qp[0] = _mm256_srai_epi32(qp[0], log_scale);
+
+ qp[1] = _mm256_add_epi32(qp[1], rnd);
+ qp[1] = _mm256_srai_epi32(qp[1], log_scale);
+ }
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+ // calculating the zbin mask.
+ qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1));
+}
+
+// Note:
+// *x is vector multiplied by *y which is 16 int32_t parallel multiplication
+// and right shift 16. The output, 16 int32_t is save in *p.
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32(const __m256i *x,
+ const __m256i *y) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+ prod_lo = _mm256_srli_epi64(prod_lo, 16);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr,
+ __m256i eobmax,
+ __m256i nz_mask) {
+ const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);
+ const __m256i packed_nz_mask_perm =
+ _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+ const __m256i iscan =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
+ const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm);
+ return _mm256_max_epi16(eobmax, nz_iscan);
+}
+
+// Get the max eob from the lower 128 bits.
+static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob) {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+#if defined(_MSC_VER) && (_MSC_VER < 1910)
+ return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff;
+#else
+ return (uint16_t)_mm256_extract_epi16(eob, 0);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize(const __m256i *qp,
+ const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+ if (_mm256_movemask_epi8(zbin_mask) == 0) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;
+ }
+ {
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]);
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]);
+ const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ }
+}
+
+void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int step = 8;
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+ (void)scan;
+
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0);
+
+ quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
+
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+ const __m256i *y,
+ int log_scale) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+ prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE void quantize_b_32x32(
+ const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+ if (_mm256_movemask_epi8(zbin_mask) == 0) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;
+ }
+
+ {
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0);
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], 1);
+ const __m256i abs_dq =
+ _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), 1);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ }
+}
+
+void vpx_highbd_quantize_b_32x32_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const unsigned int step = 8;
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+ (void)scan;
+
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1);
+
+ quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index 4535a0f7a..ae1981a83 100644
--- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -25,7 +25,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+ int i, j, non_zero_regs = (int)count / 4, eob_i = 0;
__m128i zbins[2];
__m128i nzbins[2];
@@ -82,13 +82,14 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
const uint32_t abs_qcoeff =
(uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
- qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
+ qcoeff_ptr[k] =
+ (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j];
dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
}
}
}
- *eob_ptr = eob_i + 1;
+ *eob_ptr = eob_i;
}
void vpx_highbd_quantize_b_32x32_sse2(
@@ -101,7 +102,7 @@ void vpx_highbd_quantize_b_32x32_sse2(
__m128i nzbins[2];
int idx = 0;
int idx_arr[1024];
- int i, eob = -1;
+ int i, eob = 0;
const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
(void)scan;
@@ -143,10 +144,10 @@ void vpx_highbd_quantize_b_32x32_sse2(
const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
const uint32_t abs_qcoeff =
(uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
- qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
}
- *eob_ptr = eob + 1;
+ *eob_ptr = eob;
}
#endif
diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c
new file mode 100644
index 000000000..947b5e977
--- /dev/null
+++ b/vpx_dsp/x86/highbd_sad4d_avx2.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h> // AVX2
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
+ uint32_t sad_array[4]) {
+ const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
+ const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
+ const __m256i t2 = _mm256_hadd_epi32(t0, t1);
+ const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
+ _mm256_extractf128_si256(t2, 1));
+ _mm_storeu_si128((__m128i *)sad_array, sum);
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ int x;
+
+ for (x = 0; x < 4; ++x) {
+ __m256i r[4];
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[x]);
+ r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32));
+ r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48));
+
+ // absolute differences between every ref[] to src
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3));
+
+ // sum every abs diff
+ sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1]));
+ sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3]));
+ }
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
+#define HIGHBD_SAD64XNX4D(n) \
+ void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *refs[4]; \
+ __m256i sums_16[4]; \
+ __m256i sums_32[4]; \
+ int i; \
+ \
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \
+ sums_32[0] = _mm256_setzero_si256(); \
+ sums_32[1] = _mm256_setzero_si256(); \
+ sums_32[2] = _mm256_setzero_si256(); \
+ sums_32[3] = _mm256_setzero_si256(); \
+ \
+ for (i = 0; i < (n / 2); ++i) { \
+ sums_16[0] = _mm256_setzero_si256(); \
+ sums_16[1] = _mm256_setzero_si256(); \
+ sums_16[2] = _mm256_setzero_si256(); \
+ sums_16[3] = _mm256_setzero_si256(); \
+ \
+ highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \
+ \
+ /* sums_16 will outrange after 2 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32[0] = _mm256_add_epi32( \
+ sums_32[0], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[0], 1)))); \
+ sums_32[1] = _mm256_add_epi32( \
+ sums_32[1], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[1], 1)))); \
+ sums_32[2] = _mm256_add_epi32( \
+ sums_32[2], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[2], 1)))); \
+ sums_32[3] = _mm256_add_epi32( \
+ sums_32[3], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[3], 1)))); \
+ \
+ src += src_stride << 1; \
+ } \
+ calc_final_4(sums_32, sad_array); \
+ }
+
+// 64x64
+HIGHBD_SAD64XNX4D(64)
+
+// 64x32
+HIGHBD_SAD64XNX4D(32)
+
+static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; i++) {
+ __m256i r[8];
+
+ // load src and all ref[]
+ const __m256i s = _mm256_load_si256((const __m256i *)src);
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16));
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+ r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)refs[1]);
+ r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16));
+ r[4] = _mm256_loadu_si256((const __m256i *)refs[2]);
+ r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16));
+ r[6] = _mm256_loadu_si256((const __m256i *)refs[3]);
+ r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16));
+
+ // absolute differences between every ref[] to src
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2));
+ r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s));
+ r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2));
+ r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s));
+ r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2));
+
+ // sum every abs diff
+ sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1]));
+ sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3]));
+ sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5]));
+ sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7]));
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
+#define HIGHBD_SAD32XNX4D(n) \
+ void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *refs[4]; \
+ __m256i sums_16[4]; \
+ __m256i sums_32[4]; \
+ int i; \
+ \
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \
+ sums_32[0] = _mm256_setzero_si256(); \
+ sums_32[1] = _mm256_setzero_si256(); \
+ sums_32[2] = _mm256_setzero_si256(); \
+ sums_32[3] = _mm256_setzero_si256(); \
+ \
+ for (i = 0; i < (n / 8); ++i) { \
+ sums_16[0] = _mm256_setzero_si256(); \
+ sums_16[1] = _mm256_setzero_si256(); \
+ sums_16[2] = _mm256_setzero_si256(); \
+ sums_16[3] = _mm256_setzero_si256(); \
+ \
+ highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \
+ \
+ /* sums_16 will outrange after 8 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32[0] = _mm256_add_epi32( \
+ sums_32[0], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[0], 1)))); \
+ sums_32[1] = _mm256_add_epi32( \
+ sums_32[1], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[1], 1)))); \
+ sums_32[2] = _mm256_add_epi32( \
+ sums_32[2], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[2], 1)))); \
+ sums_32[3] = _mm256_add_epi32( \
+ sums_32[3], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[3], 1)))); \
+ \
+ src += src_stride << 3; \
+ } \
+ calc_final_4(sums_32, sad_array); \
+ }
+
+// 32x64
+HIGHBD_SAD32XNX4D(64)
+
+// 32x32
+HIGHBD_SAD32XNX4D(32)
+
+// 32x16
+HIGHBD_SAD32XNX4D(16)
+
+static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; i++) {
+ __m256i r[4];
+
+ // load src and all ref[]
+ const __m256i s = _mm256_load_si256((const __m256i *)src);
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+ r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
+ r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
+ r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
+
+ // absolute differences between every ref[] to src
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s));
+
+ // sum every abs diff
+ sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]);
+ sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]);
+ sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]);
+ sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]);
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
+void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4]) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+ __m256i sums_32[4];
+ int i;
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_32[0] = _mm256_setzero_si256();
+ sums_32[1] = _mm256_setzero_si256();
+ sums_32[2] = _mm256_setzero_si256();
+ sums_32[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < 2; ++i) {
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);
+
+ // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32
+ sums_32[0] = _mm256_add_epi32(
+ sums_32[0],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+ sums_32[1] = _mm256_add_epi32(
+ sums_32[1],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+ sums_32[2] = _mm256_add_epi32(
+ sums_32[2],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+ sums_32[3] = _mm256_add_epi32(
+ sums_32[3],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+ src += src_stride << 4;
+ }
+ calc_final_4(sums_32, sad_array);
+}
+
+void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4]) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);
+
+ {
+ __m256i sums_32[4];
+ sums_32[0] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+ sums_32[1] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+ sums_32[2] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+ sums_32[3] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+ calc_final_4(sums_32, sad_array);
+ }
+}
+
+void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4]) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);
+
+ {
+ __m256i sums_32[4];
+ sums_32[0] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+ sums_32[1] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+ sums_32[2] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+ sums_32[3] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+ calc_final_4(sums_32, sad_array);
+ }
+}
diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c
new file mode 100644
index 000000000..231b67f80
--- /dev/null
+++ b/vpx_dsp/x86/highbd_sad_avx2.c
@@ -0,0 +1,468 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) {
+ const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8));
+ const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4));
+ const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1),
+ _mm256_extractf128_si256(t1, 1));
+ return (unsigned int)_mm_cvtsi128_si32(sum);
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
+ const uint16_t *src, int src_stride,
+ uint16_t *ref, int ref_stride,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+ // absolute differences between every ref[] to src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+ const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2));
+ const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3));
+ // sum every abs diff
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+#define HIGHBD_SAD64XN(n) \
+ unsigned int vpx_highbd_sad64x##n##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 2); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \
+ \
+ /* sums_16 will outrange after 2 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 1; \
+ ref += ref_stride << 1; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 64x64
+HIGHBD_SAD64XN(64)
+
+// 64x32
+HIGHBD_SAD64XN(32)
+
+static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
+ const uint16_t *src, int src_stride,
+ uint16_t *ref, int ref_stride,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ // absolute differences between every ref[] to src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+#define HIGHBD_SAD32XN(n) \
+ unsigned int vpx_highbd_sad32x##n##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 8); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \
+ \
+ /* sums_16 will outrange after 8 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 3; \
+ ref += ref_stride << 3; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 32x64
+HIGHBD_SAD32XN(64)
+
+// 32x32
+HIGHBD_SAD32XN(32)
+
+// 32x16
+HIGHBD_SAD32XN(16)
+
+static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
+ const uint16_t *src, int src_stride,
+ uint16_t *ref, int ref_stride,
+ int height) {
+ int i;
+ for (i = 0; i < height; i += 2) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+ // absolute differences between every ref[] to src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride << 1;
+ ref += ref_stride << 1;
+ }
+}
+
+unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);
+
+ // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 4;
+ ref += ref_stride << 4;
+ }
+ return calc_final(sums_32);
+}
+
+unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
+
+unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
+
+// AVG -------------------------------------------------------------------------
+static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16,
+ const uint16_t *src,
+ int src_stride, uint16_t *ref,
+ int ref_stride, uint16_t *sec,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+ const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32));
+ const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48));
+ const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+ const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+ const __m256i avg2 = _mm256_avg_epu16(r2, x2);
+ const __m256i avg3 = _mm256_avg_epu16(r3, x3);
+ // absolute differences between every ref/pred avg to src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+ const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2));
+ const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3));
+ // sum every abs diff
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
+
+ src += src_stride;
+ ref += ref_stride;
+ sec += 64;
+ }
+}
+
+#define HIGHBD_SAD64XN_AVG(n) \
+ unsigned int vpx_highbd_sad64x##n##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 2); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \
+ \
+ /* sums_16 will outrange after 2 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 1; \
+ ref += ref_stride << 1; \
+ sec += 64 << 1; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 64x64
+HIGHBD_SAD64XN_AVG(64)
+
+// 64x32
+HIGHBD_SAD64XN_AVG(32)
+
+static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16,
+ const uint16_t *src,
+ int src_stride, uint16_t *ref,
+ int ref_stride, uint16_t *sec,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+ const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+ const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+ // absolute differences between every ref/pred avg to src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride;
+ ref += ref_stride;
+ sec += 32;
+ }
+}
+
+#define HIGHBD_SAD32XN_AVG(n) \
+ unsigned int vpx_highbd_sad32x##n##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 8); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \
+ \
+ /* sums_16 will outrange after 8 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 3; \
+ ref += ref_stride << 3; \
+ sec += 32 << 3; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 32x64
+HIGHBD_SAD32XN_AVG(64)
+
+// 32x32
+HIGHBD_SAD32XN_AVG(32)
+
+// 32x16
+HIGHBD_SAD32XN_AVG(16)
+
+static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16,
+ const uint16_t *src,
+ int src_stride, uint16_t *ref,
+ int ref_stride, uint16_t *sec,
+ int height) {
+ int i;
+ for (i = 0; i < height; i += 2) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+ const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+ const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+ // absolute differences between every ref[] to src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride << 1;
+ ref += ref_stride << 1;
+ sec += 32;
+ }
+}
+
+unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ const uint8_t *second_pred) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);
+
+ // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 4;
+ ref += ref_stride << 4;
+ sec += 16 << 4;
+ }
+ return calc_final(sums_32);
+}
+
+unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ const uint8_t *second_pred) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
+
+unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index 7c8d79b09..381e0ad19 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -7,6 +7,7 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <emmintrin.h> // SSE2
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
@@ -559,3 +560,49 @@ FNS(sse2)
#undef FNS
#undef FN
+
+void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred,
+ int width, int height, const uint16_t *ref,
+ int ref_stride) {
+ int i, j;
+ if (width > 8) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; j += 16) {
+ const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]);
+ const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]);
+ const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]);
+ const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]);
+ _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0));
+ _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1));
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else if (width == 8) {
+ for (i = 0; i < height; i += 2) {
+ const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]);
+ const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]);
+ const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]);
+ const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]);
+ _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+ _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1));
+ comp_pred += 8 << 1;
+ pred += 8 << 1;
+ ref += ref_stride << 1;
+ }
+ } else {
+ assert(width == 4);
+ for (i = 0; i < height; i += 2) {
+ const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]);
+ const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]);
+ const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]);
+ const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]);
+ _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+ _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1));
+ comp_pred += 4 << 1;
+ pred += 4 << 1;
+ ref += ref_stride << 1;
+ }
+ }
+}
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index 4b02da966..f42b3df84 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -243,7 +243,7 @@ void iadst8_sse2(__m128i *const in) {
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kZero = _mm_setzero_si128();
__m128i s[8], u[16], v[8], w[16];
// transpose
@@ -546,7 +546,7 @@ void vpx_iadst16_8col_sse2(__m128i *const in) {
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kZero = _mm_setzero_si128();
u[0] = _mm_unpacklo_epi16(in[15], in[0]);
u[1] = _mm_unpackhi_epi16(in[15], in[0]);
diff --git a/vpx_dsp/x86/loopfilter_avx2.c b/vpx_dsp/x86/loopfilter_avx2.c
index be391992a..a58fb6553 100644
--- a/vpx_dsp/x86/loopfilter_avx2.c
+++ b/vpx_dsp/x86/loopfilter_avx2.c
@@ -18,7 +18,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch,
const unsigned char *limit,
const unsigned char *thresh) {
__m128i mask, hev, flat, flat2;
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
__m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
__m128i abs_p1p0;
@@ -372,7 +372,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch,
const unsigned char *limit,
const unsigned char *thresh) {
__m128i mask, hev, flat, flat2;
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
__m128i p7, p6, p5;
__m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c
index 347c9fdbe..6ea34cdd1 100644
--- a/vpx_dsp/x86/loopfilter_sse2.c
+++ b/vpx_dsp/x86/loopfilter_sse2.c
@@ -106,7 +106,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i limit_v =
_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
_mm_loadl_epi64((const __m128i *)limit));
@@ -140,7 +140,7 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i limit_v =
_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
_mm_loadl_epi64((const __m128i *)limit));
@@ -232,7 +232,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
@@ -594,7 +594,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
@@ -932,7 +932,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch,
DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
@@ -1152,7 +1152,7 @@ void vpx_lpf_horizontal_8_dual_sse2(
DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i blimit =
_mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
_mm_load_si128((const __m128i *)blimit1));
@@ -1406,7 +1406,7 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch,
const __m128i thresh =
_mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
_mm_load_si128((const __m128i *)thresh1));
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
__m128i mask, hev, flat;
diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h
index 8b6d4d1dd..031f361a4 100644
--- a/vpx_dsp/x86/mem_sse2.h
+++ b/vpx_dsp/x86/mem_sse2.h
@@ -27,13 +27,13 @@ static INLINE int32_t loadu_int32(const void *src) {
}
static INLINE __m128i load_unaligned_u32(const void *a) {
- uint32_t val;
+ int val;
memcpy(&val, a, sizeof(val));
return _mm_cvtsi32_si128(val);
}
static INLINE void store_unaligned_u32(void *const a, const __m128i v) {
- const uint32_t val = _mm_cvtsi128_si32(v);
+ const int val = _mm_cvtsi128_si32(v);
memcpy(a, &val, sizeof(val));
}
diff --git a/vpx_dsp/x86/post_proc_sse2.c b/vpx_dsp/x86/post_proc_sse2.c
index d1029afc4..119fa7cd1 100644
--- a/vpx_dsp/x86/post_proc_sse2.c
+++ b/vpx_dsp/x86/post_proc_sse2.c
@@ -36,7 +36,7 @@ void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
__m128i s = _mm_loadl_epi64((__m128i *)dst);
__m128i sum, sumsq_0, sumsq_1;
__m128i tmp_0, tmp_1;
- __m128i below_context;
+ __m128i below_context = _mm_setzero_si128();
s = _mm_unpacklo_epi8(s, zero);
diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c
index 706e4e641..7d8352721 100644
--- a/vpx_dsp/x86/quantize_avx.c
+++ b/vpx_dsp/x86/quantize_avx.c
@@ -93,8 +93,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob =
- scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
}
// AC only loop.
@@ -134,8 +133,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
}
@@ -229,8 +227,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
- eob =
- scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
}
// AC only loop.
@@ -272,8 +269,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
}
diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c
new file mode 100644
index 000000000..28f7c9c7d
--- /dev/null
+++ b/vpx_dsp/x86/quantize_avx2.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void load_b_values_avx2(
+ const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
+ __m256i *round, const int16_t *quant_ptr, __m256i *quant,
+ const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr,
+ __m256i *shift, int log_scale) {
+ *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr));
+ *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+ *zbin = _mm256_add_epi16(*zbin, rnd);
+ *zbin = _mm256_srai_epi16(*zbin, log_scale);
+ }
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+ // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16)
+ *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
+
+ *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ *round = _mm256_permute4x64_epi64(*round, 0x54);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+ *round = _mm256_add_epi16(*round, rnd);
+ *round = _mm256_srai_epi16(*round, log_scale);
+ }
+
+ *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+ *dequant =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+ *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+ *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr));
+ *shift = _mm256_permute4x64_epi64(*shift, 0x54);
+}
+
+static VPX_FORCE_INLINE __m256i
+load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // typedef int32_t tran_low_t;
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8));
+ return _mm256_packs_epi32(coeff1, coeff2);
+#else
+ // typedef int16_t tran_low_t;
+ return _mm256_loadu_si256((const __m256i *)coeff_ptr);
+#endif
+}
+
+static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals,
+ tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // typedef int32_t tran_low_t;
+ __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+ __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+ __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+ _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo);
+ _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+#else
+ // typedef int16_t tran_low_t;
+ _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals);
+#endif
+}
+
+static VPX_FORCE_INLINE __m256i
+quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant,
+ __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) {
+ const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+ const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+ if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return _mm256_setzero_si256();
+ }
+ {
+ // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const __m256i v_tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+
+ const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+ const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
+ const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift);
+ const __m256i v_nz_mask =
+ _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+ const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+ const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant);
+
+ const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high);
+ const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high);
+#else
+ const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+#endif
+
+ store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
+#else
+ store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr);
+#endif
+ return v_nz_mask;
+ }
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+ __m256i v_eobmax,
+ __m256i v_mask) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i v_iscan = _mm256_permute4x64_epi64(
+ _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
+#else
+ const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
+#endif
+ const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask);
+ return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob256);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1);
+ __m128i eob = _mm_max_epi16(eob_lo, eob_hi);
+ __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
+
+void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask;
+ __m256i v_eobmax = _mm256_setzero_si256();
+ intptr_t count;
+ (void)scan;
+
+ load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
+ &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
+ &v_quant_shift, 0);
+ // Do DC and first 15 AC.
+ v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+ &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+
+ v_round = _mm256_unpackhi_epi64(v_round, v_round);
+ v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+ v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+ v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+ v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+ for (count = n_coeffs - 16; count > 0; count -= 16) {
+ coeff_ptr += 16;
+ qcoeff_ptr += 16;
+ dqcoeff_ptr += 16;
+ iscan += 16;
+ v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+ &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+ }
+
+ *eob_ptr = accumulate_eob256(v_eobmax);
+}
+
+static VPX_FORCE_INLINE __m256i quantize_b_32x32_16(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant,
+ __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin,
+ __m256i *v_quant_shift, __m256i *v_eobmax) {
+ const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+ const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+ if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+ _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+#endif
+ return *v_eobmax;
+ }
+ {
+ // tmp = v_zbin_mask ? (int64_t)abs_coeff + round : 0
+ const __m256i v_tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+ // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ // quant_shift_ptr[rc != 0]) >> 15);
+ const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+ const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
+ const __m256i v_tmp32_hi =
+ _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1);
+ const __m256i v_tmp32_lo =
+ _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15);
+ const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo);
+ const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+ const __m256i v_sign_lo =
+ _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff);
+ const __m256i v_sign_hi =
+ _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff);
+ const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant);
+ const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant);
+ const __m256i v_dqcoeff_lo = _mm256_sign_epi32(
+ _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo);
+ const __m256i v_dqcoeff_hi = _mm256_sign_epi32(
+ _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi);
+ const __m256i v_nz_mask =
+ _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+
+ store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
+#else
+ store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi),
+ dqcoeff_ptr);
+#endif
+
+ return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask);
+ }
+}
+
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
+ __m256i v_eobmax = _mm256_setzero_si256();
+ intptr_t count;
+ (void)n_coeffs;
+ (void)scan;
+
+ load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
+ &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
+ &v_quant_shift, 1);
+
+ // Do DC and first 15 AC.
+ v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
+ &v_quant, &v_dequant, &v_round, &v_zbin,
+ &v_quant_shift, &v_eobmax);
+
+ v_round = _mm256_unpackhi_epi64(v_round, v_round);
+ v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+ v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+ v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+ v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+ for (count = (32 * 32) - 16; count > 0; count -= 16) {
+ coeff_ptr += 16;
+ qcoeff_ptr += 16;
+ dqcoeff_ptr += 16;
+ iscan += 16;
+ v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
+ &v_quant, &v_dequant, &v_round, &v_zbin,
+ &v_quant_shift, &v_eobmax);
+ }
+
+ *eob_ptr = accumulate_eob256(v_eobmax);
+}
diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c
index 459d95f28..9533e7916 100644
--- a/vpx_dsp/x86/quantize_sse2.c
+++ b/vpx_dsp/x86/quantize_sse2.c
@@ -76,7 +76,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
// AC only loop.
while (index < n_coeffs) {
@@ -106,8 +106,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
index += 16;
diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h
index afe2f924b..27bfb4e41 100644
--- a/vpx_dsp/x86/quantize_sse2.h
+++ b/vpx_dsp/x86/quantize_sse2.h
@@ -29,6 +29,15 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
*shift = _mm_load_si128((const __m128i *)shift_ptr);
}
+static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round,
+ const int16_t *quant_ptr, __m128i *quant,
+ const int16_t *dequant_ptr,
+ __m128i *dequant) {
+ *round = _mm_load_si128((const __m128i *)round_ptr);
+ *quant = _mm_load_si128((const __m128i *)quant_ptr);
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+}
+
// With ssse3 and later abs() and sign() are preferred.
static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
a = _mm_xor_si128(a, sign);
@@ -62,11 +71,8 @@ static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
#endif // CONFIG_VP9_HIGHBITDEPTH
}
-// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to
-// zbin to add 1 to the index in 'scan'.
+// Scan 16 values for eob reference in scan.
static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
- const __m128i zbin_mask0,
- const __m128i zbin_mask1,
const int16_t *scan, const int index,
const __m128i zero) {
const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
@@ -74,9 +80,6 @@ static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
__m128i scan0 = _mm_load_si128((const __m128i *)(scan + index));
__m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8));
__m128i eob0, eob1;
- // Add one to convert from indices to counts
- scan0 = _mm_sub_epi16(scan0, zbin_mask0);
- scan1 = _mm_sub_epi16(scan1, zbin_mask1);
eob0 = _mm_andnot_si128(zero_coeff0, scan0);
eob1 = _mm_andnot_si128(zero_coeff1, scan1);
return _mm_max_epi16(eob0, eob1);
diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c
index 9d2a88b7b..476230286 100644
--- a/vpx_dsp/x86/quantize_ssse3.c
+++ b/vpx_dsp/x86/quantize_ssse3.c
@@ -70,7 +70,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
// AC only loop.
while (index < n_coeffs) {
@@ -98,8 +98,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
index += 16;
@@ -202,8 +201,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
- eob =
- scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
}
// AC only loop.
@@ -249,8 +247,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
dqcoeff_ptr + 8 + index);
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
}
diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c
index 3b48acd51..29bedb0e6 100644
--- a/vpx_dsp/x86/sad_avx2.c
+++ b/vpx_dsp/x86/sad_avx2.c
@@ -14,7 +14,7 @@
#define FSAD64_H(h) \
unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride) { \
- int i, res; \
+ int i; \
__m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
__m256i sum_sad = _mm256_setzero_si256(); \
__m256i sum_sad_h; \
@@ -35,8 +35,7 @@
sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- return res; \
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
}
#define FSAD32_H(h) \
@@ -92,7 +91,7 @@ FSAD32
unsigned int vpx_sad64x##h##_avg_avx2( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred) { \
- int i, res; \
+ int i; \
__m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
__m256i sum_sad = _mm256_setzero_si256(); \
__m256i sum_sad_h; \
@@ -118,15 +117,14 @@ FSAD32
sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- return res; \
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
}
#define FSADAVG32_H(h) \
unsigned int vpx_sad32x##h##_avg_avx2( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred) { \
- int i, res; \
+ int i; \
__m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
__m256i sum_sad = _mm256_setzero_si256(); \
__m256i sum_sad_h; \
@@ -156,8 +154,7 @@ FSAD32
sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- return res; \
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
}
#define FSADAVG64 \
diff --git a/vpx_dsp/x86/subtract_avx2.c b/vpx_dsp/x86/subtract_avx2.c
new file mode 100644
index 000000000..4849581ed
--- /dev/null
+++ b/vpx_dsp/x86/subtract_avx2.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void subtract32_avx2(int16_t *diff_ptr,
+ const uint8_t *src_ptr,
+ const uint8_t *pred_ptr) {
+ const __m256i s = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i p = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
+ const __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
+ const __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
+ const __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1);
+}
+
+static VPX_FORCE_INLINE void subtract_block_16xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ const __m128i s = _mm_lddqu_si128((const __m128i *)src_ptr);
+ const __m128i p = _mm_lddqu_si128((const __m128i *)pred_ptr);
+ const __m256i s_0 = _mm256_cvtepu8_epi16(s);
+ const __m256i p_0 = _mm256_cvtepu8_epi16(p);
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void subtract_block_32xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void subtract_block_64xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
+ switch (cols) {
+ case 16:
+ subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 32:
+ subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 64:
+ subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ default:
+ vpx_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride,
+ const uint8_t *src8_ptr,
+ ptrdiff_t src_stride,
+ const uint8_t *pred8_ptr,
+ ptrdiff_t pred_stride, int bd) {
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(pred8_ptr);
+ (void)bd;
+ if (cols == 64) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+ const __m256i s2 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 32));
+ const __m256i s3 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 48));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+ const __m256i p2 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 32));
+ const __m256i p3 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 48));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ const __m256i d2 = _mm256_sub_epi16(s2, p2);
+ const __m256i d3 = _mm256_sub_epi16(s3, p3);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 32), d2);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 48), d3);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ } while (--j != 0);
+ } else if (cols == 32) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ } while (--j != 0);
+ } else if (cols == 16) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 =
+ _mm256_lddqu_si256((const __m256i *)(src_ptr + src_stride));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 =
+ _mm256_lddqu_si256((const __m256i *)(pred_ptr + pred_stride));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ } else if (cols == 8) {
+ int j = rows;
+ do {
+ const __m128i s0 = _mm_lddqu_si128((const __m128i *)src_ptr);
+ const __m128i s1 =
+ _mm_lddqu_si128((const __m128i *)(src_ptr + src_stride));
+ const __m128i p0 = _mm_lddqu_si128((const __m128i *)pred_ptr);
+ const __m128i p1 =
+ _mm_lddqu_si128((const __m128i *)(pred_ptr + pred_stride));
+ const __m128i d0 = _mm_sub_epi16(s0, p0);
+ const __m128i d1 = _mm_sub_epi16(s1, p1);
+ _mm_storeu_si128((__m128i *)diff_ptr, d0);
+ _mm_storeu_si128((__m128i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ } else {
+ int j = rows;
+ assert(cols == 4);
+ do {
+ const __m128i s0 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ const __m128i s1 =
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+ const __m128i p0 = _mm_loadl_epi64((const __m128i *)pred_ptr);
+ const __m128i p1 =
+ _mm_loadl_epi64((const __m128i *)(pred_ptr + pred_stride));
+ const __m128i d0 = _mm_sub_epi16(s0, p0);
+ const __m128i d1 = _mm_sub_epi16(s1, p1);
+ _mm_storel_epi64((__m128i *)diff_ptr, d0);
+ _mm_storel_epi64((__m128i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/sum_squares_sse2.c b/vpx_dsp/x86/sum_squares_sse2.c
index 14f3b35c0..df6514b2c 100644
--- a/vpx_dsp/x86/sum_squares_sse2.c
+++ b/vpx_dsp/x86/sum_squares_sse2.c
@@ -33,7 +33,7 @@ uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) {
} else {
// Generic case
int r = size;
- const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+ const __m128i v_zext_mask_q = _mm_set_epi32(0, -1, 0, -1);
__m128i v_acc_q = _mm_setzero_si128();
assert(size % 8 == 0);
diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c
index 9232acbfb..35925d590 100644
--- a/vpx_dsp/x86/variance_avx2.c
+++ b/vpx_dsp/x86/variance_avx2.c
@@ -590,17 +590,20 @@ static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride,
return sum;
}
-static unsigned int sub_pixel_variance32xh_avx2(
- const uint8_t *src, int src_stride, int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride, int height, unsigned int *sse) {
+static int sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height, unsigned int *sse) {
return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
NULL, 0, 0, height, sse);
}
-static unsigned int sub_pixel_avg_variance32xh_avx2(
- const uint8_t *src, int src_stride, int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride, const uint8_t *second_pred,
- int second_stride, int height, unsigned int *sse) {
+static int sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred,
+ int second_stride, int height,
+ unsigned int *sse) {
return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
second_pred, second_stride, 1, height, sse);
}
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
index a67c92aad..d6eb12da1 100644
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -19,7 +19,7 @@
static INLINE unsigned int add32x4_sse2(__m128i val) {
val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
- return _mm_cvtsi128_si32(val);
+ return (unsigned int)_mm_cvtsi128_si32(val);
}
unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) {
@@ -85,7 +85,7 @@ static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_unpacklo_epi16(vsum, vsum);
vsum = _mm_srai_epi32(vsum, 16);
- *sum = add32x4_sse2(vsum);
+ *sum = (int)add32x4_sse2(vsum);
}
static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
@@ -97,7 +97,7 @@ static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
// Can handle 1024 pixels' diff sum (such as 32x32)
static INLINE int sum_final_sse2(const __m128i sum) {
const __m128i t = sum_to_32bit_sse2(sum);
- return add32x4_sse2(t);
+ return (int)add32x4_sse2(t);
}
static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride,
@@ -349,7 +349,7 @@ unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride,
vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
}
*sse = add32x4_sse2(vsse);
- sum = add32x4_sse2(vsum);
+ sum = (int)add32x4_sse2(vsum);
return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}
@@ -369,7 +369,7 @@ unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride,
vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
}
*sse = add32x4_sse2(vsse);
- sum = add32x4_sse2(vsum);
+ sum = (int)add32x4_sse2(vsum);
return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}
@@ -389,7 +389,7 @@ unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride,
vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
}
*sse = add32x4_sse2(vsse);
- sum = add32x4_sse2(vsum);
+ sum = (int)add32x4_sse2(vsum);
return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
}
diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
index 0cbd151dc..21a35ae3c 100644
--- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
+++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -485,7 +485,7 @@ static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
// Saturate and convert to 8-bit words
dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
- *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
src_ptr += src_stride;
dst_ptr += dst_stride;
@@ -589,8 +589,8 @@ static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr,
res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero);
// Save only half of the register (8 words)
- *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012);
- *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123);
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012);
+ *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123);
// Update the source by two rows
src_ptr += src_stride_unrolled;
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 6f2983a4b..c7d880860 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -227,6 +227,9 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2(
s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]);
}
+ // The output_height is always a multiple of two.
+ assert(!(output_height & 1));
+
for (i = output_height; i > 1; i -= 2) {
__m256i srcRegHead2, srcRegHead3;
@@ -282,35 +285,6 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2(
s2[2] = s2[3];
srcRegHead1 = srcRegHead3;
}
-
- // if the number of strides is odd.
- // process only 16 bytes
- if (i > 0) {
- // load the last 16 bytes
- const __m128i srcRegHead2 =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
- // merge the last 2 results together
- s1[0] = _mm256_castsi128_si256(
- _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2));
- s2[0] = _mm256_castsi128_si256(
- _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2));
-
- outReg1 = convolve8_8_avx2(s1, f);
- outReg2 = convolve8_8_avx2(s2, f);
-
- // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
- // contain the first and second convolve result respectively
- outReg1 = _mm_packus_epi16(outReg1, outReg2);
-
- // average if necessary
- if (avg) {
- outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
- }
-
- // save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, outReg1);
- }
}
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
@@ -798,7 +772,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr,
// Pack to 8-bits
dst = _mm_packus_epi16(dst, _mm_setzero_si128());
- *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst);
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst);
}
}
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index ed46d6245..4ea2752d3 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -580,7 +580,7 @@ static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr,
// Pack to 8-bits
dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
- *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
src_ptr += src_stride;
dst_ptr += dst_stride;
@@ -666,8 +666,8 @@ static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr,
reg_1 = _mm_packus_epi16(reg_1, reg_1);
// Save the result
- *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0);
- *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1);
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0);
+ *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1);
// Update the source by two rows
src_ptr += src_stride_unrolled;
diff --git a/vpx_ports/compiler_attributes.h b/vpx_ports/compiler_attributes.h
index 354352016..4b468749b 100644
--- a/vpx_ports/compiler_attributes.h
+++ b/vpx_ports/compiler_attributes.h
@@ -29,13 +29,23 @@
#endif // __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
#if defined(__clang__) && __has_attribute(no_sanitize)
+// Both of these have defined behavior and are used in certain operations or
+// optimizations thereof. There are cases where an overflow may be unintended,
+// however, so use of these attributes should be done with care.
#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \
__attribute__((no_sanitize("unsigned-integer-overflow")))
-#endif
+#if __clang_major__ >= 12
+#define VPX_NO_UNSIGNED_SHIFT_CHECK \
+ __attribute__((no_sanitize("unsigned-shift-base")))
+#endif // __clang__ >= 12
+#endif // __clang__
#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK
#define VPX_NO_UNSIGNED_OVERFLOW_CHECK
#endif
+#ifndef VPX_NO_UNSIGNED_SHIFT_CHECK
+#define VPX_NO_UNSIGNED_SHIFT_CHECK
+#endif
//------------------------------------------------------------------------------
// Variable attributes.
diff --git a/vpxenc.c b/vpxenc.c
index 7eff97b13..61672acad 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -524,9 +524,12 @@ static const arg_def_t row_mt =
static const arg_def_t disable_loopfilter =
ARG_DEF(NULL, "disable-loopfilter", 1,
- "Control Loopfilter in VP9\n"
+ "Control Loopfilter in VP9:\n"
+ " "
"0: Loopfilter on for all frames (default)\n"
+ " "
"1: Loopfilter off for non reference frames\n"
+ " "
"2: Loopfilter off for all frames");
#endif
diff --git a/webmdec.h b/webmdec.h
index d8618b07d..6ae7ee16d 100644
--- a/webmdec.h
+++ b/webmdec.h
@@ -27,7 +27,7 @@ struct WebmInputContext {
const void *block;
int block_frame_index;
int video_track_index;
- uint64_t timestamp_ns;
+ int64_t timestamp_ns;
int is_key_frame;
int reached_eos;
};
diff --git a/y4menc.c b/y4menc.c
index 02b729e5b..187798127 100644
--- a/y4menc.c
+++ b/y4menc.c
@@ -17,39 +17,34 @@ int y4m_write_file_header(char *buf, size_t len, int width, int height,
const char *color;
switch (bit_depth) {
case 8:
- color = fmt == VPX_IMG_FMT_I444
- ? "C444\n"
- : fmt == VPX_IMG_FMT_I422 ? "C422\n" : "C420jpeg\n";
+ color = fmt == VPX_IMG_FMT_I444 ? "C444\n"
+ : fmt == VPX_IMG_FMT_I422 ? "C422\n"
+ : "C420jpeg\n";
break;
case 9:
- color = fmt == VPX_IMG_FMT_I44416
- ? "C444p9 XYSCSS=444P9\n"
- : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n"
- : "C420p9 XYSCSS=420P9\n";
+ color = fmt == VPX_IMG_FMT_I44416 ? "C444p9 XYSCSS=444P9\n"
+ : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n"
+ : "C420p9 XYSCSS=420P9\n";
break;
case 10:
- color = fmt == VPX_IMG_FMT_I44416
- ? "C444p10 XYSCSS=444P10\n"
- : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n"
- : "C420p10 XYSCSS=420P10\n";
+ color = fmt == VPX_IMG_FMT_I44416 ? "C444p10 XYSCSS=444P10\n"
+ : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n"
+ : "C420p10 XYSCSS=420P10\n";
break;
case 12:
- color = fmt == VPX_IMG_FMT_I44416
- ? "C444p12 XYSCSS=444P12\n"
- : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n"
- : "C420p12 XYSCSS=420P12\n";
+ color = fmt == VPX_IMG_FMT_I44416 ? "C444p12 XYSCSS=444P12\n"
+ : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n"
+ : "C420p12 XYSCSS=420P12\n";
break;
case 14:
- color = fmt == VPX_IMG_FMT_I44416
- ? "C444p14 XYSCSS=444P14\n"
- : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n"
- : "C420p14 XYSCSS=420P14\n";
+ color = fmt == VPX_IMG_FMT_I44416 ? "C444p14 XYSCSS=444P14\n"
+ : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n"
+ : "C420p14 XYSCSS=420P14\n";
break;
case 16:
- color = fmt == VPX_IMG_FMT_I44416
- ? "C444p16 XYSCSS=444P16\n"
- : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n"
- : "C420p16 XYSCSS=420P16\n";
+ color = fmt == VPX_IMG_FMT_I44416 ? "C444p16 XYSCSS=444P16\n"
+ : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n"
+ : "C420p16 XYSCSS=420P16\n";
break;
default: color = NULL; assert(0);
}
diff --git a/y4minput.c b/y4minput.c
index 7d3c03a7f..745e2f1cd 100644
--- a/y4minput.c
+++ b/y4minput.c
@@ -21,12 +21,13 @@
// Reads 'size' bytes from 'file' into 'buf' with some fault tolerance.
// Returns true on success.
static int file_read(void *buf, size_t size, FILE *file) {
- const int kMaxRetries = 5;
- int retry_count = 0;
- int file_error;
+ const int kMaxTries = 5;
+ int try_count = 0;
+ int file_error = 0;
size_t len = 0;
- do {
+ while (!feof(file) && len < size && try_count < kMaxTries) {
const size_t n = fread((uint8_t *)buf + len, 1, size - len, file);
+ ++try_count;
len += n;
file_error = ferror(file);
if (file_error) {
@@ -39,13 +40,13 @@ static int file_read(void *buf, size_t size, FILE *file) {
return 0;
}
}
- } while (!feof(file) && len < size && ++retry_count < kMaxRetries);
+ }
if (!feof(file) && len != size) {
fprintf(stderr,
"Error reading file: %u of %u bytes read,"
- " error: %d, retries: %d, %d: %s\n",
- (uint32_t)len, (uint32_t)size, file_error, retry_count, errno,
+ " error: %d, tries: %d, %d: %s\n",
+ (uint32_t)len, (uint32_t)size, file_error, try_count, errno,
strerror(errno));
}
return len == size;