author | Sadaf Ebrahimi <sadafebrahimi@google.com> | 2023-09-07 18:14:22 +0000
committer | Gerrit Code Review <noreply-gerritcodereview@google.com> | 2023-09-07 18:14:22 +0000
commit | f0da29480964d8f73d12d9c018e0cd42d73829d6 (patch)
tree | 71d20d87f6b82890c1503ea80ea32a8d1b54bb20
parent | 08f747f785e3032936ed9357560bc0ad39658287 (diff)
parent | 1b580f29c8442ceac459aa77bd1e5154f62d636b (diff)
download | libyuv-f0da29480964d8f73d12d9c018e0cd42d73829d6.tar.gz
Merge changes from topic "upgrade-libyuv" into main
* changes:
Upgrade libyuv to f0921806a293e3e008e6325a51d4ea760c39d2c1
Initial repository for libyuv
49 files changed, 2722 insertions(+), 1044 deletions(-)
@@ -72,6 +72,7 @@ cc_library { "source/scale_neon.cc", "source/scale_neon64.cc", "source/scale_rgb.cc", + "source/scale_rvv.cc", "source/scale_uv.cc", "source/video_common.cc", ], @@ -82,6 +83,7 @@ cc_library { "-Wno-unused-parameter", "-fexceptions", "-DHAVE_JPEG", + "-DLIBYUV_UNLIMITED_DATA", ], arch: { @@ -1,14 +0,0 @@ -# Copyright 2011 Google Inc. All Rights Reserved. -# -# Description: -# The libyuv package provides implementation yuv image conversion and -# scaling. -# -# This library is used by Talk Video and WebRTC. -# - -licenses(['notice']) # 3-clause BSD - -exports_files(['LICENSE']) - -package(default_visibility = ['//visibility:public']) @@ -36,6 +36,12 @@ config("libyuv_config") { if (libyuv_disable_rvv) { defines += [ "LIBYUV_DISABLE_RVV" ] } + if (!libyuv_use_lsx) { + defines += [ "LIBYUV_DISABLE_LSX" ] + } + if (!libyuv_use_lasx) { + defines += [ "LIBYUV_DISABLE_LASX" ] + } } # This target is built when no specific target is specified on the command line. @@ -74,6 +80,14 @@ group("libyuv") { deps += [ ":libyuv_msa" ] } + if (libyuv_use_lsx) { + deps += [ ":libyuv_lsx"] + } + + if (libyuv_use_lasx) { + deps += [ ":libyuv_lasx"] + } + if (!is_ios && !libyuv_disable_jpeg) { # Make sure that clients of libyuv link with libjpeg. This can't go in # libyuv_internal because in Windows x64 builds that will generate a clang @@ -142,6 +156,7 @@ static_library("libyuv_internal") { "source/scale_common.cc", "source/scale_gcc.cc", "source/scale_rgb.cc", + "source/scale_rvv.cc", "source/scale_uv.cc", "source/scale_win.cc", "source/video_common.cc", @@ -235,6 +250,44 @@ if (libyuv_use_msa) { } } +if (libyuv_use_lsx) { + static_library("libyuv_lsx") { + sources = [ + # LSX Source Files + "source/row_lsx.cc", + "source/rotate_lsx.cc", + "source/scale_lsx.cc", + ] + + cflags_cc = [ + "-mlsx", + "-Wno-c++11-narrowing", + ] + + deps = [ ":libyuv_internal" ] + + public_configs = [ ":libyuv_config" ] + } +} + +if (libyuv_use_lasx) { + static_library("libyuv_lasx") { + sources = [ + # LASX Source Files + "source/row_lasx.cc", + ] + + cflags_cc = [ + "-mlasx", + "-Wno-c++11-narrowing", + ] + + deps = [ ":libyuv_internal" ] + + public_configs = [ ":libyuv_config" ] + } +} + if (libyuv_include_tests) { config("libyuv_unittest_warnings_config") { if (!is_win) { diff --git a/CM_linux_packages.cmake b/CM_linux_packages.cmake index 5f676f89..a073edfa 100644 --- a/CM_linux_packages.cmake +++ b/CM_linux_packages.cmake @@ -8,7 +8,7 @@ SET ( YUV_VER_MAJOR 0 ) SET ( YUV_VER_MINOR 0 ) SET ( YUV_VER_PATCH ${YUV_VERSION_NUMBER} ) SET ( YUV_VERSION ${YUV_VER_MAJOR}.${YUV_VER_MINOR}.${YUV_VER_PATCH} ) -MESSAGE ( "Building ver.: ${YUV_VERSION}" ) +MESSAGE ( VERBOSE "Building ver.: ${YUV_VERSION}" ) # is this a 32-bit or 64-bit build? IF ( CMAKE_SIZEOF_VOID_P EQUAL 8 ) @@ -45,7 +45,7 @@ ELSE () SET ( YUV_SYSTEM_NAME "amd-${YUV_BIT_SIZE}" ) ENDIF () ENDIF () -MESSAGE ( "Packaging for: ${YUV_SYSTEM_NAME}" ) +MESSAGE ( VERBOSE "Packaging for: ${YUV_SYSTEM_NAME}" ) # define all the variables needed by CPack to create .deb and .rpm packages SET ( CPACK_PACKAGE_VENDOR "Frank Barchard" ) @@ -5,20 +5,20 @@ gclient_gn_args = [ vars = { 'chromium_git': 'https://chromium.googlesource.com', - 'chromium_revision': 'd1501576384de23ddf8d8815ee7c95be2f708de5', - 'gn_version': 'git_revision:e3978de3e8dafb50a2b11efa784e08699a43faf8', + 'chromium_revision': 'af3d01376bec75a68f90160bfd38057d60510a2b', + 'gn_version': 'git_revision:fae280eabe5d31accc53100137459ece19a7a295', # ninja CIPD package version. 
# https://chrome-infra-packages.appspot.com/p/infra/3pp/tools/ninja 'ninja_version': 'version:2@1.11.1.chromium.6', # reclient CIPD package version - 'reclient_version': 're_client_version:0.107.1.0b39c4c-gomaip', + 'reclient_version': 're_client_version:0.110.0.43ec6b1-gomaip', # Keep the Chromium default of generating location tags. 'generate_location_tags': True, # By default, download the fuchsia sdk from the public sdk directory. - 'fuchsia_sdk_cipd_prefix': 'fuchsia/sdk/gn/', - 'fuchsia_version': 'version:12.20230530.1.1', + 'fuchsia_sdk_cipd_prefix': 'fuchsia/sdk/core/', + 'fuchsia_version': 'version:13.20230714.0.1', # By default, download the fuchsia images from the fuchsia GCS bucket. 'fuchsia_images_bucket': 'fuchsia', 'checkout_fuchsia': False, @@ -31,13 +31,13 @@ vars = { deps = { 'src/build': - Var('chromium_git') + '/chromium/src/build' + '@' + 'd0c2b4cf4fdd43866e066fb6722099aa8bf4ce79', + Var('chromium_git') + '/chromium/src/build' + '@' + '860dae780c100c2d001dc6ee16625b17bc84c10f', 'src/buildtools': - Var('chromium_git') + '/chromium/src/buildtools' + '@' + 'edbefcee3d2cc45cdb0c60c2b01b673f8ba728bc', + Var('chromium_git') + '/chromium/src/buildtools' + '@' + 'ca163845c76db63454f99436f6cd2bf03739dc24', 'src/testing': - Var('chromium_git') + '/chromium/src/testing' + '@' + 'a13817e1ea0255a375d13aeb3bb2527bd528495b', + Var('chromium_git') + '/chromium/src/testing' + '@' + '184b068a94f24ddf0b4299d48062779e1fc1950e', 'src/third_party': - Var('chromium_git') + '/chromium/src/third_party' + '@' + '824e26c9fcbd00fccf6cdb712f8f127aae133042', + Var('chromium_git') + '/chromium/src/third_party' + '@' + '2dc4b18abd1003ce7b1eda509dc96f12d49a9667', 'src/buildtools/linux64': { 'packages': [ @@ -82,17 +82,19 @@ deps = { 'dep_type': 'cipd', }, - 'src/buildtools/clang_format/script': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/clang/tools/clang-format.git' + '@' + 'f97059df7f8b205064625cdb5f97b56668a125ef', + # TODO(chromium:1458042): Remove these paths, when chromium builds files + # have moved to third_party/lib*/src paths. 
'src/buildtools/third_party/libc++/trunk': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxx.git' + '@' + 'f8279b01085b800724f5c5629dc365b9f040dc53', + Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxx.git' + '@' + '84fb809dd6dae36d556dc0bb702c6cc2ce9d4b80', 'src/buildtools/third_party/libc++abi/trunk': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxxabi.git' + '@' + '5c8dbff7a4911fe1e0af0bc1628891e4187a3c90', + Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxxabi.git' + '@' + 'd4760c0af99ccc9bce077960d5ddde4d66146c05', 'src/buildtools/third_party/libunwind/trunk': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'cd144ced35285edaa064a91561969e5b22c219b1', + Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + '6c0013015be8a2be9de4b1e54cdc9d576b1d0729', 'src/third_party/catapult': - Var('chromium_git') + '/catapult.git' + '@' + '9f3ef9c2eae9b1adabde88efe5dcc438ba76e205', + Var('chromium_git') + '/catapult.git' + '@' + 'fa05d995e152efdae488a2aeba397cd609fdbc9d', + 'src/third_party/clang-format/script': + Var('chromium_git') + '/external/github.com/llvm/llvm-project/clang/tools/clang-format.git' + '@' + 'f97059df7f8b205064625cdb5f97b56668a125ef', 'src/third_party/colorama/src': Var('chromium_git') + '/external/colorama.git' + '@' + '3de9f013df4b470069d03d250224062e8cf15c49', 'src/third_party/cpu_features/src': { @@ -100,19 +102,29 @@ deps = { 'condition': 'checkout_android', }, 'src/third_party/depot_tools': - Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + '05ab73be51774f098eb580eda6e96a49e1010b1b', + Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + 'd3e43dd4319ba169c0aaf44547eecf861f2fe5da', 'src/third_party/freetype/src': - Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + '80a507a6b8e3d2906ad2c8ba69329bd2fb2a85ef', + Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + '9e3c5d7e183c1a8d5ed8868d7d28ef18d3ec9ec8', + 'third_party/fuchsia-gn-sdk': { + 'url': Var('chromium_git') + '/chromium/src/third_party/fuchsia-gn-sdk.git' + '@' + '0d6902558d92fe3d49ba9a8f638ddea829be595b', + 'condition': 'checkout_fuchsia', + }, 'src/third_party/googletest/src': Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + 'af29db7ec28d6df1c7f0f745186884091e602e07', 'src/third_party/harfbuzz-ng/src': - Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + '8df5cdbcda495a582e72a7e2ce35d6106401edce', + Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + 'db700b5670d9475cc8ed4880cc9447b232c5e432', + 'src/third_party/libc++/src': + Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxx.git' + '@' + '84fb809dd6dae36d556dc0bb702c6cc2ce9d4b80', + 'src/third_party/libc++abi/src': + Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxxabi.git' + '@' + '8d21803b9076b16d46c32e2f10da191ee758520c', + 'src/third_party/libunwind/src': + Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'f1c687e0aaf0d70b9a53a150e9be5cb63af9215f', 'src/third_party/libjpeg_turbo': - Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + 'aa4075f116e4312537d0d3e9dbd5e31096539f94', + Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + '30bdb85e302ecfc52593636b2f44af438e05e784', 'src/third_party/nasm': Var('chromium_git') + '/chromium/deps/nasm.git' + '@' + 
'7fc833e889d1afda72c06220e5bed8fb43b2e5ce', 'src/tools': - Var('chromium_git') + '/chromium/src/tools' + '@' + '916dfffd61cbf61075c47d7b480425d7de1483fd', + Var('chromium_git') + '/chromium/src/tools' + '@' + 'a76c0dbb64c603a0d45e0c6dfae3a351b6e1adf1', # libyuv-only dependencies (not present in Chromium). 'src/third_party/gtest-parallel': @@ -139,7 +151,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/kotlin_stdlib', - 'version': 'z4_AYYz2Tw5GKikuiDLTuxxf0NJVGLkC3CVcyiIpc-gC', + 'version': 'Z1gsqhL967kFQecxKrRwXHbl-vwQjpv0l7PMUZ0EVO8C', }, ], 'condition': 'checkout_android', @@ -149,7 +161,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/kotlinc', - 'version': 'J3BAlA7yf4corBopDhlwuT9W4jR1Z9R55KD3BUTVldQC', + 'version': 'Rr02Gf2EkaeSs3EhSUHhPqDHSd1AzimrM6cRYUJCPjQC', }, ], 'condition': 'checkout_android', @@ -157,9 +169,9 @@ deps = { }, 'src/third_party/boringssl/src': - 'https://boringssl.googlesource.com/boringssl.git' + '@' + 'dd5219451c3ce26221762a15d867edf43b463bb2', + 'https://boringssl.googlesource.com/boringssl.git' + '@' + '20a06474c0b4a16779311bfe98ba69dc2402101d', 'src/base': { - 'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'b4c5ce6cb1a7c90de3fdddc80ed439fe87eab443', + 'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'd407b7061bce341bb6e11b539ea86c46c949ac4c', 'condition': 'checkout_android', }, 'src/third_party/bazel': { @@ -182,16 +194,22 @@ deps = { 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_ndk': { - 'url': Var('chromium_git') + '/android_ndk.git' + '@' + '310956bd122ec2b96049f8d7398de6b717f3452e', - 'condition': 'checkout_android', + 'src/third_party/android_toolchain': { + 'packages': [ + { + 'package': 'chromium/third_party/android_toolchain/android_toolchain', + 'version': 'R_8suM8m0oHbZ1awdxGXvKEFpAOETscbfZxkkMthyk8C', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', }, 'src/third_party/androidx': { 'packages': [ { 'package': 'chromium/third_party/androidx', - 'version': 'Wr5b9WJiFAzJcmjmvQIePIxk5IgpDl62kaGY_SiLxJEC', + 'version': 'y7rF_rx56mD3FGhMiqnlbQ6HOqHJ95xUFNX1m-_a988C', }, ], 'condition': 'checkout_android', @@ -211,8 +229,8 @@ deps = { 'src/third_party/android_sdk/public': { 'packages': [ { - 'package': 'chromium/third_party/android_sdk/public/build-tools/33.0.0', - 'version': '-VRKr36Uw8L_iFqqo9nevIBgNMggND5iWxjidyjnCgsC', + 'package': 'chromium/third_party/android_sdk/public/build-tools/34.0.0', + 'version': 'YK9Rzw3fDzMHVzatNN6VlyoD_81amLZpN1AbmkdOd6AC', }, { 'package': 'chromium/third_party/android_sdk/public/emulator', @@ -224,11 +242,11 @@ deps = { }, { 'package': 'chromium/third_party/android_sdk/public/platform-tools', - 'version': 'RSI3iwryh7URLGRgJHsCvUxj092woTPnKt4pwFcJ6L8C', + 'version': 'HWVsGs2HCKgSVv41FsOcsfJbNcB0UFiNrF6Tc4yRArYC', }, { - 'package': 'chromium/third_party/android_sdk/public/platforms/android-33', - 'version': 'eo5KvW6UVor92LwZai8Zulc624BQZoCu-yn7wa1z_YcC', + 'package': 'chromium/third_party/android_sdk/public/platforms/android-34', + 'version': 'u-bhWbTME6u-DjypTgr3ZikCyeAeU6txkR9ET6Uudc8C', }, { 'package': 'chromium/third_party/android_sdk/public/platforms/android-tiramisuprivacysandbox', @@ -330,7 +348,7 @@ deps = { }, 'src/third_party/icu': { - 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'a2961dc659b4ae847a9c6120718cc2517ee57d9e', + 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'e8c3bc9ea97d4423ad0515e5f1c064f486dae8b1', }, 'src/third_party/icu4j': { 'packages': [ @@ -356,7 +374,7 @@ deps 
= { 'packages': [ { 'package': 'chromium/third_party/jdk', - 'version': '2Of9Pe_OdO4xoAATuiLDiMVNebKTNO3WrwJGqil4RosC', + 'version': 'IivIDwNBf73mf7UwCOBceRUuDdtizMCgSOQDfUGHArsC', }, ], 'condition': 'checkout_android', @@ -411,7 +429,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/r8', - 'version': '4Oq32DG2vuDh7Frxj6tH5xyi77sVgBWpvvl4hwvZRR4C', + 'version': 'O1BBWiBTIeNUcraX8STMtQXVaCleu6SJJjWCcnfhPLkC', }, ], 'condition': 'checkout_android', @@ -424,7 +442,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/r8', - 'version': 'PwglNZFRNPkBBXdnY9NfrZFk2ULWDTRxhV9rl2kvkpUC', + 'version': 'vw5kLlW3-suSlCKSO9OQpFWpR8oDnvQ8k1RgKNUapQYC', }, ], 'condition': 'checkout_android', @@ -441,7 +459,7 @@ deps = { 'dep_type': 'cipd', }, 'src/third_party/requests/src': { - 'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'refs/tags/v2.23.0', + 'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'c7e0fc087ceeadb8b4c84a0953a422c474093d6d', 'condition': 'checkout_android', }, 'src/third_party/robolectric': { @@ -468,7 +486,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/turbine', - 'version': 'Foa7uRpVoKr4YoayCKc9EERkjpmGOE3DAUTWFLL7gKEC', + 'version': '2I2Nz480QsuCxpQ1lMfbigX8l5HAhX3_ykWU4TKRGo4C', }, ], 'condition': 'checkout_android', @@ -481,7 +499,7 @@ deps = { # iOS deps: 'src/ios': { - 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '241921896b64f85de9a32d461462913cbff4baeb', + 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + 'ddd58e86cf4ebdc0db60a5d0f3c323de49bb295c', 'condition': 'checkout_ios' }, @@ -1680,7 +1698,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_squareup_okio_okio_jvm', - 'version': 'version:2@3.0.0.cr1', + 'version': 'version:2@3.3.0.cr1', }, ], 'condition': 'checkout_android', @@ -1691,7 +1709,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_squareup_wire_wire_runtime_jvm', - 'version': 'version:2@4.5.1.cr1', + 'version': 'version:2@4.7.0.cr1', }, ], 'condition': 'checkout_android', @@ -1823,7 +1841,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/net_bytebuddy_byte_buddy', - 'version': 'version:2@1.14.4.cr1', + 'version': 'version:2@1.14.5.cr1', }, ], 'condition': 'checkout_android', @@ -1834,7 +1852,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/net_bytebuddy_byte_buddy_agent', - 'version': 'version:2@1.14.4.cr1', + 'version': 'version:2@1.14.5.cr1', }, ], 'condition': 'checkout_android', @@ -2043,7 +2061,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_android', - 'version': 'version:2@5.3.1.cr1', + 'version': 'version:2@5.4.0.cr1', }, ], 'condition': 'checkout_android', @@ -2054,7 +2072,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_core', - 'version': 'version:2@5.3.1.cr1', + 'version': 'version:2@5.4.0.cr1', }, ], 'condition': 'checkout_android', @@ -2065,7 +2083,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_subclass', - 'version': 'version:2@5.3.1.cr1', + 'version': 'version:2@5.4.0.cr1', }, ], 'condition': 'checkout_android', @@ -1,14 +1,19 @@ -name: "libyuv" -description: - "libyuv is an open source project that includes YUV scaling and conversion " - "functionality." +# This project was upgraded with external_updater. 
+# Usage: tools/external_updater/updater.sh update libyuv +# For more info, check https://cs.android.com/android/platform/superproject/+/main:tools/external_updater/README.md +name: "libyuv" +description: "libyuv is an open source project that includes YUV scaling and conversion functionality." third_party { url { type: GIT value: "https://chromium.googlesource.com/libyuv/libyuv/" } - version: "2a6cb7431939faba1b40d3f08883847f0cf63572" - last_upgrade_date { year: 2023 month: 6 day: 1 } + version: "f0921806a293e3e008e6325a51d4ea760c39d2c1" license_type: NOTICE + last_upgrade_date { + year: 2023 + month: 9 + day: 5 + } } @@ -1 +1,11 @@ -include platform/system/core:/janitors/OWNERS
\ No newline at end of file +mbonadei@chromium.org +fbarchard@chromium.org +magjed@chromium.org +wtc@google.com +jansson@google.com + +per-file *.gn=mbonadei@chromium.org,jansson@google.com +per-file .gitignore=* +per-file AUTHORS=* +per-file DEPS=* +per-file PRESUBMIT.py=mbonadei@chromium.org,jansson@google.com diff --git a/OWNERS.android b/OWNERS.android new file mode 100644 index 00000000..7529cb92 --- /dev/null +++ b/OWNERS.android @@ -0,0 +1 @@ +include platform/system/core:/janitors/OWNERS diff --git a/README.chromium b/README.chromium index 880191e4..c68be174 100644 --- a/README.chromium +++ b/README.chromium @@ -1,8 +1,9 @@ Name: libyuv -URL: http://code.google.com/p/libyuv/ -Version: 1871 +URL: https://chromium.googlesource.com/libyuv/libyuv/ +Version: 1875 License: BSD License File: LICENSE +Shipped: yes Description: libyuv is an open source project that includes YUV conversion and scaling functionality. diff --git a/README.version b/README.version deleted file mode 100644 index 6eb9dc8c..00000000 --- a/README.version +++ /dev/null @@ -1,8 +0,0 @@ -Version: r1871 -BugComponent: 42195 -Owner: lajos -Local Modifications: - * Remove files/Android.mk (it messes with the android build system). - * Remove OWNERS files within files/ and all the subdirectories (except for - files/fuzz). Having these files breaks repo presubmit hooks since they - contain non @google.com email addresses. diff --git a/UPDATING b/UPDATING deleted file mode 100644 index 2679284c..00000000 --- a/UPDATING +++ /dev/null @@ -1,36 +0,0 @@ -To sync the libyuv checkout to an upstream revision, do the following: - -These commands are known to work from the external/libyuv directory of the -Android tree's checkout. - -Step 1: Remove the files/ subdirectory. - -$ rm -rf files - -Step 2: Clone the libyuv repository from upstream. - -$ git clone https://chromium.googlesource.com/libyuv/libyuv files - -Step 3 (optional): Checkout a specific commit/tag. - -$ cd files -$ git checkout <commit_or_tag> -$ cd .. - -Step 4: Remove files that aren't necessary (Android.mk, .git and OWNERS). - -$ rm files/Android.mk -$ rm -rf files/.git -$ find files/ -name "OWNERS" | xargs rm - -Step 5: Update the version and last_upgrade_date fields in the METADATA file. - -Step 6: Update README.version with the version (can be found in - files/include/libyuv/version.h) - -Step 7: If any local modifications are being done, update README.version and - this file with updated instructions. - -Step 8: Ensure that libyuv builds and camera and media related CTS tests are - passing. If there are any linker errors about missing symbols, try - updating frameworks/av/media/libstagefright/export.lds. 
diff --git a/docs/deprecated_builds.md b/docs/deprecated_builds.md index ba42966c..8edefd78 100644 --- a/docs/deprecated_builds.md +++ b/docs/deprecated_builds.md @@ -165,11 +165,11 @@ mipsel arm32 disassembly: - third_party/android_ndk/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-objdump -d out/Release/obj/source/libyuv.row_neon.o + llvm-objdump -d out/Release/obj/source/libyuv.row_neon.o arm64 disassembly: - third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d out/Release/obj/source/libyuv.row_neon64.o + llvm-objdump -d out/Release/obj/source/libyuv.row_neon64.o Running tests: diff --git a/docs/getting_started.md b/docs/getting_started.md index b19f0009..f2f71b8b 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -139,11 +139,11 @@ mips arm disassembly: - third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv/row_common.o >row_common.txt + llvm-objdump -d ./out/Release/obj/libyuv/row_common.o >row_common.txt - third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon.o >row_neon.txt + llvm-objdump -d ./out/Release/obj/libyuv_neon/row_neon.o >row_neon.txt - third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt + llvm-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt Caveat: Disassembly may require optimize_max be disabled in BUILD.gn @@ -238,6 +238,18 @@ After running script, clang & qemu are built in `build-toolchain-qemu/riscv-clan -DUSE_RVV=ON . cmake --build out/Release/ +#### Customized Compiler Flags + +Customized compiler flags are supported by `-DRISCV_COMPILER_FLAGS="xxx"`. +If `-DRISCV_COMPILER_FLAGS="xxx"` is manually assigned, other compile flags(e.g disable -march=xxx) will not be appended. + +Example: + + cmake -B out/Release/ -DUNIT_TEST=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE="./riscv_script/riscv-clang.cmake" \ + -DRISCV_COMPILER_FLAGS="-mcpu=sifive-x280" \ + . ### Run on QEMU diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h index d8e82d72..8293c919 100644 --- a/include/libyuv/compare_row.h +++ b/include/libyuv/compare_row.h @@ -28,7 +28,10 @@ extern "C" { #endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) -#if __has_feature(memory_sanitizer) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) #define LIBYUV_DISABLE_X86 #endif #endif diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 154f2f21..f9344721 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -30,7 +30,10 @@ extern "C" { #endif // MemorySanitizer does not support assembly code yet. 
http://crbug.com/344505 #if defined(__has_feature) -#if __has_feature(memory_sanitizer) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) #define LIBYUV_DISABLE_X86 #endif #endif @@ -827,15 +830,6 @@ int ARGBCopyYToAlpha(const uint8_t* src_y, int width, int height); -typedef void (*ARGBBlendRow)(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); - -// Get function to Alpha Blend ARGB pixels and store to destination. -LIBYUV_API -ARGBBlendRow GetARGBBlend(); - // Alpha Blend ARGB images and store to destination. // Source is pre-multiplied by alpha using ARGBAttenuate. // Alpha of destination is set to 255. diff --git a/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h index 2dd8c03d..3e6a2fef 100644 --- a/include/libyuv/rotate_row.h +++ b/include/libyuv/rotate_row.h @@ -28,7 +28,10 @@ extern "C" { #endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) -#if __has_feature(memory_sanitizer) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) #define LIBYUV_DISABLE_X86 #endif #endif diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 5b244d77..0455b4cc 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -31,7 +31,10 @@ extern "C" { #endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) -#if __has_feature(memory_sanitizer) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) #define LIBYUV_DISABLE_X86 #endif #endif @@ -161,7 +164,6 @@ extern "C" { #define HAS_ARGBSEPIAROW_SSSE3 #define HAS_ARGBSHADEROW_SSE2 #define HAS_ARGBSUBTRACTROW_SSE2 -#define HAS_ARGBUNATTENUATEROW_SSE2 #define HAS_BLENDPLANEROW_SSSE3 #define HAS_COMPUTECUMULATIVESUMROW_SSE2 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 @@ -171,9 +173,6 @@ extern "C" { #define HAS_SOBELXROW_SSE2 #define HAS_SOBELXYROW_SSE2 #define HAS_SOBELYROW_SSE2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBATTENUATEROW_SSSE3 -#endif // The following functions fail on gcc/clang 32 bit with fpic and framepointer. // caveat: clangcl uses row_win.cc which works. 
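
These row.h hunks do two things: the MemorySanitizer guard now disables the NEON assembly paths as well as x86 (MSan cannot track values written by assembly), and several HAS_* defines are resorted, with the ARGBAttenuate/ARGBUnattenuate kernels moving between feature blocks. A HAS_FOO define only means the kernel was compiled in; which kernel actually runs is chosen per process via TestCpuFlag(). A minimal sketch of that dispatch pattern, assuming the declarations from row.h and cpu_id.h (PickAttenuateRow is a hypothetical helper; libyuv's own converters additionally route odd widths through the _Any_ wrappers):

```cpp
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"

// Pick the fastest compiled-in ARGBAttenuateRow kernel at runtime.
// HAS_ARGBATTENUATEROW_* is defined by row.h when the kernel was built;
// TestCpuFlag() checks whether this CPU can actually run it.
typedef void (*AttenuateRowFn)(const uint8_t* src_argb, uint8_t* dst_argb,
                               int width);

AttenuateRowFn PickAttenuateRow() {
  AttenuateRowFn fn = libyuv::ARGBAttenuateRow_C;
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3)) {
    fn = libyuv::ARGBAttenuateRow_SSSE3;
  }
#endif
#if defined(HAS_ARGBATTENUATEROW_AVX2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasAVX2)) {
    fn = libyuv::ARGBAttenuateRow_AVX2;
  }
#endif
#if defined(HAS_ARGBATTENUATEROW_RVV)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasRVV)) {
    fn = libyuv::ARGBAttenuateRow_RVV;
  }
#endif
  return fn;
}
```

The new RVV defines added later in this diff (guarded by `__riscv_v_intrinsic == 11000`) slot into the same pattern via kCpuHasRVV.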
@@ -241,11 +240,7 @@ extern "C" { #define HAS_ARGBADDROW_AVX2 #define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 -#define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_BLENDPLANEROW_AVX2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBATTENUATEROW_AVX2 -#endif #if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ defined(_MSC_VER) @@ -285,14 +280,15 @@ extern "C" { #define HAS_ABGRTOAR30ROW_SSSE3 #define HAS_ABGRTOYJROW_SSSE3 #define HAS_AR64TOARGBROW_SSSE3 +#define HAS_ARGBATTENUATEROW_SSSE3 #define HAS_ARGBTOAB64ROW_SSSE3 #define HAS_ARGBTOAR30ROW_SSSE3 #define HAS_ARGBTOAR64ROW_SSSE3 +#define HAS_ARGBUNATTENUATEROW_SSE2 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 -#define HAS_DETILEROW_SSE2 #define HAS_DETILEROW_16_SSE2 -#define HAS_DETILEROW_16_AVX +#define HAS_DETILEROW_SSE2 #define HAS_DETILESPLITUVROW_SSSE3 #define HAS_DETILETOYUY2_SSE2 #define HAS_HALFMERGEUVROW_SSSE3 @@ -345,13 +341,16 @@ extern "C" { #define HAS_ABGRTOYJROW_AVX2 #define HAS_ABGRTOYROW_AVX2 #define HAS_AR64TOARGBROW_AVX2 +#define HAS_ARGBATTENUATEROW_AVX2 #define HAS_ARGBTOAB64ROW_AVX2 #define HAS_ARGBTOAR30ROW_AVX2 #define HAS_ARGBTOAR64ROW_AVX2 #define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2 +#define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 +#define HAS_DETILEROW_16_AVX #define HAS_DIVIDEROW_16_AVX2 #define HAS_HALFMERGEUVROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 @@ -795,19 +794,25 @@ extern "C" { #endif #if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) +#define HAS_COPYROW_RVV +#if __riscv_v_intrinsic == 11000 #define HAS_AB64TOARGBROW_RVV +#define HAS_ABGRTOYJROW_RVV +#define HAS_ABGRTOYROW_RVV #define HAS_AR64TOARGBROW_RVV #define HAS_ARGBATTENUATEROW_RVV +#define HAS_ARGBBLENDROW_RVV +#define HAS_ARGBCOPYYTOALPHAROW_RVV +#define HAS_ARGBEXTRACTALPHAROW_RVV #define HAS_ARGBTOAB64ROW_RVV #define HAS_ARGBTOAR64ROW_RVV #define HAS_ARGBTORAWROW_RVV #define HAS_ARGBTORGB24ROW_RVV -#define HAS_ARGBTOYROW_RVV #define HAS_ARGBTOYJROW_RVV -#define HAS_ABGRTOYROW_RVV -#define HAS_ABGRTOYJROW_RVV +#define HAS_ARGBTOYMATRIXROW_RVV +#define HAS_ARGBTOYROW_RVV #define HAS_BGRATOYROW_RVV -#define HAS_COPYROW_RVV +#define HAS_BLENDPLANEROW_RVV #define HAS_I400TOARGBROW_RVV #define HAS_I422ALPHATOARGBROW_RVV #define HAS_I422TOARGBROW_RVV @@ -822,10 +827,10 @@ extern "C" { #define HAS_MERGERGBROW_RVV #define HAS_MERGEUVROW_RVV #define HAS_MERGEXRGBROW_RVV -#define HAS_SPLITARGBROW_RVV -#define HAS_SPLITRGBROW_RVV -#define HAS_SPLITUVROW_RVV -#define HAS_SPLITXRGBROW_RVV +#define HAS_NV12TOARGBROW_RVV +#define HAS_NV12TORGB24ROW_RVV +#define HAS_NV21TOARGBROW_RVV +#define HAS_NV21TORGB24ROW_RVV #define HAS_RAWTOARGBROW_RVV #define HAS_RAWTORGB24ROW_RVV #define HAS_RAWTORGBAROW_RVV @@ -834,8 +839,15 @@ extern "C" { #define HAS_RGB24TOARGBROW_RVV #define HAS_RGB24TOYJROW_RVV #define HAS_RGB24TOYROW_RVV -#define HAS_RGBATOYROW_RVV #define HAS_RGBATOYJROW_RVV +#define HAS_RGBATOYMATRIXROW_RVV +#define HAS_RGBATOYROW_RVV +#define HAS_RGBTOYMATRIXROW_RVV +#define HAS_SPLITARGBROW_RVV +#define HAS_SPLITRGBROW_RVV +#define HAS_SPLITUVROW_RVV +#define HAS_SPLITXRGBROW_RVV +#endif #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -1351,6 +1363,26 @@ void UYVYToARGBRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void NV12ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* 
yuvconstants, + int width); +void NV21ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -3081,6 +3113,9 @@ void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb, uint8_t* dst_a, int width); +void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, + uint8_t* dst_a, + int width); void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -3100,6 +3135,7 @@ void ARGBExtractAlphaRow_Any_LSX(const uint8_t* src_ptr, void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4515,6 +4551,10 @@ void ARGBBlendRow_LSX(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); +void ARGBBlendRow_RVV(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); void ARGBBlendRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, @@ -4541,6 +4581,11 @@ void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void BlendPlaneRow_RVV(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width); void BlendPlaneRow_C(const uint8_t* src0, const uint8_t* src1, const uint8_t* alpha, @@ -6180,7 +6225,19 @@ void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr, float* dst_ptr, float param, int width); - +// Convert FP16 Half Floats to FP32 Floats +void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16 + float* dst, + int width); +// Convert a column of FP16 Half Floats to a row of FP32 Floats +void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 + int src_stride, // stride in elements + float* dst, + int width); +// Convert FP32 Floats to FP16 Half Floats +void ConvertFP32ToFP16Row_NEON(const float* src, + uint16_t* dst, // fp16 + int width); void ARGBLumaColorTableRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width, diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index a7957c3f..c015d772 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -29,7 +29,10 @@ extern "C" { #endif // MemorySanitizer does not support assembly code yet. 
http://crbug.com/344505 #if defined(__has_feature) -#if __has_feature(memory_sanitizer) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) #define LIBYUV_DISABLE_X86 #endif #endif @@ -175,6 +178,36 @@ extern "C" { #define HAS_SCALEROWDOWN34_LSX #endif +#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) +#define HAS_SCALEADDROW_RVV +#define HAS_SCALEUVROWDOWN4_RVV +#define HAS_SCALEUVROWDOWNEVEN_RVV +#if __riscv_v_intrinsic == 11000 +#define HAS_SCALEARGBROWDOWN2_RVV +#define HAS_SCALEARGBROWDOWN2BOX_RVV +#define HAS_SCALEARGBROWDOWN2LINEAR_RVV +#define HAS_SCALEARGBROWDOWNEVENBOX_RVV +#define HAS_SCALEROWDOWN2_RVV +#define HAS_SCALEROWDOWN2BOX_RVV +#define HAS_SCALEROWDOWN2LINEAR_RVV +#define HAS_SCALEROWDOWN34_0_BOX_RVV +#define HAS_SCALEROWDOWN34_1_BOX_RVV +#define HAS_SCALEROWDOWN34_RVV +#define HAS_SCALEROWDOWN38_2_BOX_RVV +#define HAS_SCALEROWDOWN38_3_BOX_RVV +#define HAS_SCALEROWDOWN38_RVV +#define HAS_SCALEROWDOWN4_RVV +#define HAS_SCALEROWDOWN4BOX_RVV +#define HAS_SCALEROWUP2_BILINEAR_RVV +#define HAS_SCALEROWUP2_LINEAR_RVV +#define HAS_SCALEUVROWDOWN2_RVV +#define HAS_SCALEUVROWDOWN2BOX_RVV +#define HAS_SCALEUVROWDOWN2LINEAR_RVV +#define HAS_SCALEUVROWUP2_BILINEAR_RVV +#define HAS_SCALEUVROWUP2_LINEAR_RVV +#endif +#endif + // Scale ARGB vertically with bilinear interpolation. void ScalePlaneVertical(int src_height, int dst_width, @@ -949,6 +982,18 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); +void ScaleARGBRowDown2_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, @@ -1061,6 +1106,16 @@ void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb, int src_stepx, uint8_t* dst_argb, int dst_width); +void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, @@ -1143,6 +1198,18 @@ void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); +void ScaleUVRowDown2_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Linear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -1203,6 +1270,16 @@ void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr, int src_stepx, uint8_t* dst_uv, int dst_width); +void ScaleUVRowDown4_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEven_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_uv, + int dst_width); void 
ScaleUVRowDownEven_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, int32_t src_stepx, @@ -1292,6 +1369,14 @@ void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); @@ -1744,6 +1829,61 @@ void ScaleRowDown34_1_Box_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); +void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleRowDown2_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); + +void ScaleRowDown4_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); + +void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/include/libyuv/version.h b/include/libyuv/version.h index b6623dbb..d45ef09d 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1871 +#define LIBYUV_VERSION 1875 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/infra/config/OWNERS b/infra/config/OWNERS new file mode 100644 index 00000000..2c4f90a0 --- /dev/null +++ b/infra/config/OWNERS @@ -0,0 +1,3 @@ +fbarchard@chromium.org +mbonadei@chromium.org +jansson@google.com diff --git a/infra/config/cr-buildbucket.cfg b/infra/config/cr-buildbucket.cfg index be9d1d28..7415851b 100644 --- a/infra/config/cr-buildbucket.cfg +++ b/infra/config/cr-buildbucket.cfg @@ -29,11 +29,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -64,11 +59,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' 
"metrics_project": "chromium-reclient-metrics"' @@ -99,11 +89,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -132,10 +117,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -164,10 +145,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -196,10 +173,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -230,11 +203,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -265,11 +233,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -300,11 +263,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -335,11 +293,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -370,11 +323,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -405,11 +353,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -440,11 +383,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -475,11 +413,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -510,11 +443,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' 
"$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -545,11 +473,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -580,11 +503,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -614,10 +532,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -647,10 +561,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -680,10 +590,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -714,11 +620,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -749,11 +650,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -784,11 +680,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -819,11 +710,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -854,11 +740,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -889,11 +770,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -924,11 +800,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -959,11 +830,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": 
"goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -993,10 +859,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1026,10 +888,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-trusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1114,10 +972,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1146,10 +1000,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1178,10 +1028,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1212,11 +1058,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1247,11 +1088,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1281,10 +1117,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1314,10 +1146,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1348,11 +1176,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1383,11 +1206,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1418,11 +1236,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1453,11 +1266,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": 
"goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1488,11 +1296,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1523,11 +1326,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1558,11 +1356,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1593,11 +1386,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1627,10 +1415,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1660,10 +1444,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1693,10 +1473,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1727,11 +1503,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": true,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1764,11 +1535,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": false,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1799,11 +1565,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": false,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1834,11 +1595,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": false,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1869,11 +1625,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": false,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1904,11 
+1655,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": false,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' @@ -1939,11 +1685,6 @@ buckets { } properties: '{' - ' "$build/goma": {' - ' "enable_ats": false,' - ' "server_host": "goma.chromium.org",' - ' "use_luci_auth": true' - ' },' ' "$build/reclient": {' ' "instance": "rbe-webrtc-untrusted",' ' "metrics_project": "chromium-reclient-metrics"' diff --git a/infra/config/main.star b/infra/config/main.star index 7490a599..e83afe4f 100755 --- a/infra/config/main.star +++ b/infra/config/main.star @@ -8,24 +8,6 @@ lucicfg.check_version("1.30.9") LIBYUV_GIT = "https://chromium.googlesource.com/libyuv/libyuv" LIBYUV_GERRIT = "https://chromium-review.googlesource.com/libyuv/libyuv" -GOMA_BACKEND_RBE_PROD = { - "server_host": "goma.chromium.org", - "use_luci_auth": True, -} - -GOMA_BACKEND_RBE_ATS_PROD = { - "server_host": "goma.chromium.org", - "use_luci_auth": True, - "enable_ats": True, -} - -# Disable ATS on Windows CQ/try. -GOMA_BACKEND_RBE_NO_ATS_PROD = { - "server_host": "goma.chromium.org", - "use_luci_auth": True, - "enable_ats": False, -} - RECLIENT_CI = { "instance": "rbe-webrtc-trusted", "metrics_project": "chromium-reclient-metrics", @@ -80,7 +62,7 @@ luci.project( ], bindings = [ luci.binding( - roles = "role/swarming.taskTriggerer", # for LED tasks. + roles = "role/swarming.taskTriggerer", # for LED tasks. groups = "project-libyuv-admins", ), luci.binding( @@ -218,19 +200,6 @@ def get_os_dimensions(os): return {"os": "Ubuntu-18.04", "cores": "8", "cpu": "x86-64"} return {} -def get_os_properties(os, try_builder = False): - if os == "android": - return {"$build/goma": GOMA_BACKEND_RBE_PROD} - elif os in ("ios", "mac"): - return {"$build/goma": GOMA_BACKEND_RBE_PROD} - elif os == "win" and try_builder: - return {"$build/goma": GOMA_BACKEND_RBE_NO_ATS_PROD} - elif os == "win": - return {"$build/goma": GOMA_BACKEND_RBE_ATS_PROD} - elif os == "linux": - return {"$build/goma": GOMA_BACKEND_RBE_ATS_PROD} - return {} - def libyuv_ci_builder(name, dimensions, properties, triggered_by): return luci.builder( name = name, @@ -268,8 +237,7 @@ def libyuv_try_builder(name, dimensions, properties, recipe_name = "libyuv/libyu def ci_builder(name, os, category, short_name = None): dimensions = get_os_dimensions(os) - properties = get_os_properties(os) - properties["$build/reclient"] = RECLIENT_CI + properties = {"$build/reclient": RECLIENT_CI} dimensions["pool"] = "luci.flex.ci" properties["builder_group"] = "client.libyuv" @@ -280,8 +248,7 @@ def ci_builder(name, os, category, short_name = None): def try_builder(name, os, experiment_percentage = None): dimensions = get_os_dimensions(os) - properties = get_os_properties(os, try_builder = True) - properties["$build/reclient"] = RECLIENT_CQ + properties = {"$build/reclient": RECLIENT_CQ} dimensions["pool"] = "luci.flex.try" properties["builder_group"] = "tryserver.libyuv" diff --git a/infra/config/project.cfg b/infra/config/project.cfg index af79cfb2..3c327118 100644 --- a/infra/config/project.cfg +++ b/infra/config/project.cfg @@ -7,7 +7,7 @@ name: "libyuv" access: "group:all" lucicfg { - version: "1.39.8" + version: "1.39.14" package_dir: "." config_dir: "." entry_point: "main.star" @@ -7,6 +7,7 @@ # be found in the AUTHORS file in the root of the source tree. 
import("//build/config/arm.gni") +import("//build/config/loongarch64.gni") import("//build/config/mips.gni") import("//build_overrides/build.gni") @@ -21,4 +22,8 @@ declare_args() { (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_msa libyuv_use_mmi = (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_mmi + libyuv_use_lsx = + (current_cpu == "loong64") && loongarch64_use_lsx + libyuv_use_lasx = + (current_cpu == "loong64") && loongarch64_use_lasx } diff --git a/riscv_script/riscv-clang.cmake b/riscv_script/riscv-clang.cmake index 47dd5067..e287941f 100644 --- a/riscv_script/riscv-clang.cmake +++ b/riscv_script/riscv-clang.cmake @@ -28,17 +28,20 @@ set(CMAKE_OBJDUMP "${TOOLCHAIN_PATH}/bin/llvm-objdump") set(CMAKE_OBJCOPY "${TOOLCHAIN_PATH}/bin/llvm-objcopy") # compile options -message(STATUS "USE_RVV: ${USE_RVV}") -message(STATUS "USE_AUTO_VECTORIZER: ${USE_AUTO_VECTORIZER}") -set(RISCV_COMPILER_FLAGS) -if(USE_RVV) - list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gcv") - if(NOT USE_AUTO_VECTORIZER) - # Disable auto-vectorizer - add_compile_options(-fno-vectorize -fno-slp-vectorize) +set(RISCV_COMPILER_FLAGS "" CACHE STRING "Compile flags") +# if user provides RISCV_COMPILER_FLAGS, appeding compile flags is avoided. +if(RISCV_COMPILER_FLAGS STREQUAL "") + message(STATUS "USE_RVV: ${USE_RVV}") + message(STATUS "USE_AUTO_VECTORIZER: ${USE_AUTO_VECTORIZER}") + if(USE_RVV) + list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gcv") + if(NOT USE_AUTO_VECTORIZER) + # Disable auto-vectorizer + add_compile_options(-fno-vectorize -fno-slp-vectorize) + endif() + else() + list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gc") endif() -else() - list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gc") endif() message(STATUS "RISCV_COMPILER_FLAGS: ${RISCV_COMPILER_FLAGS}") diff --git a/source/convert.cc b/source/convert.cc index b11ab1bf..b68fb1d3 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -2128,6 +2128,11 @@ int ARGBToI420Alpha(const uint8_t* src_argb, : ARGBExtractAlphaRow_Any_LSX; } #endif +#if defined(HAS_ARGBEXTRACTALPHAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBExtractAlphaRow = ARGBExtractAlphaRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); diff --git a/source/convert_argb.cc b/source/convert_argb.cc index cc6560de..f6ab0784 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -3853,6 +3853,11 @@ int NV12ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV12TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + NV12ToARGBRow = NV12ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); @@ -3938,6 +3943,11 @@ int NV21ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV21TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + NV21ToARGBRow = NV21ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width); @@ -4058,6 +4068,11 @@ int NV12ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV12TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + NV12ToRGB24Row = NV12ToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width); @@ -4119,6 +4134,11 @@ int NV21ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV21TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + NV21ToRGB24Row = NV21ToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { NV21ToRGB24Row(src_y, src_vu, 
dst_rgb24, yuvconstants, width); @@ -6020,6 +6040,12 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y, ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; } #endif +#if defined(HAS_SCALEROWUP2_BILINEAR_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_RVV; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV; + } +#endif // alloc 4 lines temp const int row_size = (width + 31) & ~31; @@ -6151,6 +6177,11 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y, ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; } #endif +#if defined(HAS_SCALEROWUP2_LINEAR_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV; + } +#endif // alloc 2 lines temp const int row_size = (width + 31) & ~31; @@ -6276,6 +6307,12 @@ static int I420ToRGB24MatrixBilinear(const uint8_t* src_y, ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; } #endif +#if defined(HAS_SCALEROWUP2_BILINEAR_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_RVV; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV; + } +#endif // alloc 4 lines temp const int row_size = (width + 31) & ~31; @@ -6837,6 +6874,12 @@ static int I420AlphaToARGBMatrixBilinear( ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; } #endif +#if defined(HAS_SCALEROWUP2_BILINEAR_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_RVV; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV; + } +#endif // alloc 4 lines temp const int row_size = (width + 31) & ~31; @@ -7032,6 +7075,11 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; } #endif +#if defined(HAS_SCALEROWUP2_LINEAR_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV; + } +#endif // alloc 2 lines temp const int row_size = (width + 31) & ~31; @@ -7770,6 +7818,11 @@ static int I422ToRGB24MatrixLinear(const uint8_t* src_y, ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; } #endif +#if defined(HAS_SCALEROWUP2_LINEAR_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV; + } +#endif // alloc 2 lines temp const int row_size = (width + 31) & ~31; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index d115a2a1..f6ec0dac 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -2783,37 +2783,6 @@ int RGB24Mirror(const uint8_t* src_rgb24, return 0; } -// Get a blender that optimized for the CPU and pixel count. -// As there are 6 blenders to choose from, the caller should try to use -// the same blend function for all pixels if possible. -LIBYUV_API -ARGBBlendRow GetARGBBlend() { - void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, - uint8_t* dst_argb, int width) = ARGBBlendRow_C; -#if defined(HAS_ARGBBLENDROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBBlendRow = ARGBBlendRow_SSSE3; - return ARGBBlendRow; - } -#endif -#if defined(HAS_ARGBBLENDROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBBlendRow = ARGBBlendRow_NEON; - } -#endif -#if defined(HAS_ARGBBLENDROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBBlendRow = ARGBBlendRow_MSA; - } -#endif -#if defined(HAS_ARGBBLENDROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - ARGBBlendRow = ARGBBlendRow_LSX; - } -#endif - return ARGBBlendRow; -} - // Alpha Blend 2 ARGB images and store to destination. 
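The dispatch hunks above all follow libyuv's standard runtime-selection idiom: each entry point starts a row-function pointer at the portable _C kernel, then upgrades it through a chain of TestCpuFlag checks, each guarded by a HAS_* macro so the reference is only emitted when that backend was compiled in. A minimal sketch of the idiom, with a hypothetical DoRow family standing in for the real row functions (DoRow_C/DoRow_NEON/DoRow_RVV and HAS_DOROW_* are assumptions for illustration; TestCpuFlag and the kCpuHas* flags are the real libyuv/cpu_id.h API):

#include <stdint.h>
#include "libyuv/cpu_id.h"  // TestCpuFlag, kCpuHasNEON, kCpuHasRVV

// Trivial stand-in row kernel; the real _C kernels live in row_common.cc.
static void DoRow_C(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[i];
  }
}
#if defined(HAS_DOROW_NEON)
void DoRow_NEON(const uint8_t* src, uint8_t* dst, int width);
#endif
#if defined(HAS_DOROW_RVV)
void DoRow_RVV(const uint8_t* src, uint8_t* dst, int width);
#endif

static void Process(const uint8_t* src, uint8_t* dst, int width, int height) {
  // Select the best row function once per call, not once per row.
  void (*DoRow)(const uint8_t*, uint8_t*, int) = DoRow_C;
#if defined(HAS_DOROW_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
    DoRow = DoRow_NEON;
  }
#endif
#if defined(HAS_DOROW_RVV)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasRVV)) {
    DoRow = DoRow_RVV;  // a later match overrides an earlier one
  }
#endif
  for (int y = 0; y < height; ++y) {
    DoRow(src + y * width, dst + y * width, width);
  }
}

The ARGBBlend hunk that follows applies the same shape in place of the deleted GetARGBBlend helper, which is what lets the new RVV path join the selection chain.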
LIBYUV_API int ARGBBlend(const uint8_t* src_argb0, @@ -2826,7 +2795,7 @@ int ARGBBlend(const uint8_t* src_argb0, int height) { int y; void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, - uint8_t* dst_argb, int width) = GetARGBBlend(); + uint8_t* dst_argb, int width) = ARGBBlendRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -2843,7 +2812,31 @@ int ARGBBlend(const uint8_t* src_argb0, height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } - +#if defined(HAS_ARGBBLENDROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBBlendRow = ARGBBlendRow_SSSE3; + } +#endif +#if defined(HAS_ARGBBLENDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBBlendRow = ARGBBlendRow_NEON; + } +#endif +#if defined(HAS_ARGBBLENDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBBlendRow = ARGBBlendRow_MSA; + } +#endif +#if defined(HAS_ARGBBLENDROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBBlendRow = ARGBBlendRow_LSX; + } +#endif +#if defined(HAS_ARGBBLENDROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBBlendRow = ARGBBlendRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); src_argb0 += src_stride_argb0; @@ -2903,6 +2896,11 @@ int BlendPlane(const uint8_t* src_y0, } } #endif +#if defined(HAS_BLENDPLANEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + BlendPlaneRow = BlendPlaneRow_RVV; + } +#endif for (y = 0; y < height; ++y) { BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width); @@ -2980,6 +2978,11 @@ int I420Blend(const uint8_t* src_y0, } } #endif +#if defined(HAS_BLENDPLANEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + BlendPlaneRow = BlendPlaneRow_RVV; + } +#endif if (!IS_ALIGNED(width, 2)) { ScaleRowDown2 = ScaleRowDown2Box_Odd_C; } @@ -3016,6 +3019,11 @@ int I420Blend(const uint8_t* src_y0, } } #endif +#if defined(HAS_SCALEROWDOWN2_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowDown2 = ScaleRowDown2Box_RVV; + } +#endif // Row buffer for intermediate alpha pixels. 
align_buffer_64(halfalpha, halfwidth); @@ -5340,6 +5348,11 @@ int ARGBExtractAlpha(const uint8_t* src_argb, : ARGBExtractAlphaRow_Any_LSX; } #endif +#if defined(HAS_ARGBEXTRACTALPHAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBExtractAlphaRow = ARGBExtractAlphaRow_RVV; + } +#endif for (int y = 0; y < height; ++y) { ARGBExtractAlphaRow(src_argb, dst_a, width); @@ -5391,6 +5404,11 @@ int ARGBCopyYToAlpha(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBCOPYYTOALPHAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBCopyYToAlphaRow(src_y, dst_argb, width); diff --git a/source/rotate.cc b/source/rotate.cc index 8d3978c7..3678b80a 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -489,13 +489,12 @@ int RotatePlane(const uint8_t* src, return -1; } -LIBYUV_API -void TransposePlane_16(const uint16_t* src, - int src_stride, - uint16_t* dst, - int dst_stride, - int width, - int height) { +static void TransposePlane_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { int i = height; // Work across the source in 8x8 tiles while (i >= 8) { diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc index c7239010..034d53e8 100644 --- a/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -69,6 +69,11 @@ static int ARGBTranspose(const uint8_t* src_argb, } } #endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_RVV; + } +#endif for (i = 0; i < width; ++i) { // column of source to row of dest. ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height); diff --git a/source/rotate_common.cc b/source/rotate_common.cc index 4b496d1b..e72608e9 100644 --- a/source/rotate_common.cc +++ b/source/rotate_common.cc @@ -120,37 +120,6 @@ void TransposeWx8_16_C(const uint16_t* src, } } -void TransposeUVWx8_16_C(const uint16_t* src, - int src_stride, - uint16_t* dst_a, - int dst_stride_a, - uint16_t* dst_b, - int dst_stride_b, - int width) { - int i; - for (i = 0; i < width; ++i) { - dst_a[0] = src[0 * src_stride + 0]; - dst_b[0] = src[0 * src_stride + 1]; - dst_a[1] = src[1 * src_stride + 0]; - dst_b[1] = src[1 * src_stride + 1]; - dst_a[2] = src[2 * src_stride + 0]; - dst_b[2] = src[2 * src_stride + 1]; - dst_a[3] = src[3 * src_stride + 0]; - dst_b[3] = src[3 * src_stride + 1]; - dst_a[4] = src[4 * src_stride + 0]; - dst_b[4] = src[4 * src_stride + 1]; - dst_a[5] = src[5 * src_stride + 0]; - dst_b[5] = src[5 * src_stride + 1]; - dst_a[6] = src[6 * src_stride + 0]; - dst_b[6] = src[6 * src_stride + 1]; - dst_a[7] = src[7 * src_stride + 0]; - dst_b[7] = src[7 * src_stride + 1]; - src += 2; - dst_a += dst_stride_a; - dst_b += dst_stride_b; - } -} - void TransposeWxH_16_C(const uint16_t* src, int src_stride, uint16_t* dst, diff --git a/source/row_common.cc b/source/row_common.cc index 8be37fb5..7591c6b6 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -48,7 +48,6 @@ extern "C" { defined(__i386__) || defined(_M_IX86)) #define LIBYUV_ARGBTOUV_PAVGB 1 #define LIBYUV_RGBTOU_TRUNCATE 1 -#define LIBYUV_ATTENUATE_DUP 1 #endif #if defined(LIBYUV_BIT_EXACT) #define LIBYUV_UNATTENUATE_DUP 1 @@ -1876,9 +1875,10 @@ static __inline void YPixel(uint8_t y, int yg = yuvconstants->kYToRgb[0]; #endif uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; - *b = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6)); - *g = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6)); - *r = 
STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6)); + uint8_t b8 = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6)); + *b = b8; + *g = b8; + *r = b8; } void I444ToARGBRow_C(const uint8_t* src_y, @@ -3369,12 +3369,7 @@ void BlendPlaneRow_C(const uint8_t* src0, } #undef UBLEND -#if LIBYUV_ATTENUATE_DUP -// This code mimics the SSSE3 version for better testability. -#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 -#else -#define ATTENUATE(f, a) (f * a + 128) >> 8 -#endif +#define ATTENUATE(f, a) (f * a + 255) >> 8 // Multiply source RGB by alpha and store to destination. void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { diff --git a/source/row_gcc.cc b/source/row_gcc.cc index e94fd04d..d8074987 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -7441,93 +7441,106 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, - 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; -static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; +static const vec8 kAttenuateShuffle = {6, -128, 6, -128, 6, -128, + -128, -128, 14, -128, 14, -128, + 14, -128, -128, -128}; + // Attenuate 4 pixels at a time. void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "pslld $0x18,%%xmm3 \n" "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "punpcklbw %%xmm6,%%xmm7 \n" + "sub %0,%1 \n" // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm1,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "punpckhbw %%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "pand %%xmm3,%%xmm2 \n" + "movdqu (%0),%%xmm6 \n" + "movdqa %%xmm6,%%xmm0 \n" + "movdqa %%xmm6,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufb %%xmm4,%%xmm2 \n" // a,a,a,0 + "pshufb %%xmm4,%%xmm3 \n" + "pmullw %%xmm2,%%xmm0 \n" // rgb * alpha + "pmullw %%xmm3,%%xmm1 \n" + "paddw %%xmm7,%%xmm0 \n" // + 255 + "paddw %%xmm7,%%xmm1 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "pand %%xmm5,%%xmm6 \n" + "por %%xmm6,%%xmm0 \n" + "movdqu %%xmm0,(%0,%1) \n" + "lea 0x10(%0),%0 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha0), // %3 - "m"(kShuffleAlpha1) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kAttenuateShuffle) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBATTENUATEROW_SSSE3 #ifdef HAS_ARGBATTENUATEROW_AVX2 + // Shuffle table duplicating alpha. 
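One consequence of the row_common.cc change above is worth spelling out: ATTENUATE now biases by 255 instead of 128 before the >> 8, and the rewritten SSSE3/AVX2 kernels below add the same 255 per lane (a 0x00ff word vector), so every backend computes (f * a + 255) >> 8. The payoff is that fully opaque alpha becomes an exact identity. A quick standalone scalar check (Attenuate is a hypothetical helper name, not libyuv API):

#include <assert.h>
#include <stdint.h>

static uint8_t Attenuate(uint8_t f, uint8_t a) {
  return (uint8_t)((f * a + 255) >> 8);  // the new ATTENUATE definition
}

int main(void) {
  assert(Attenuate(255, 255) == 255);  // opaque alpha preserves the channel
  assert(Attenuate(255, 0) == 0);      // zero alpha clears the channel
  assert(((255 * 255 + 128) >> 8) == 254);  // the old +128 rounding lost 1
  return 0;
}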
-static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, - 128u, 128u, 14u, 15u, 14u, 15u, - 14u, 15u, 128u, 128u}; +static const lvec8 kAttenuateShuffle_AVX2 = { + 6, -128, 6, -128, 6, -128, -128, -128, 14, -128, 14, + -128, 14, -128, -128, -128, 22, -128, 22, -128, 22, -128, + -128, -128, 30, -128, 30, -128, 30, -128, -128, -128}; + // Attenuate 8 pixels at a time. void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" + "vmovdqa %3,%%ymm4 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpslld $0x18,%%ymm5,%%ymm5 \n" + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" + "vpunpcklbw %%ymm6,%%ymm7,%%ymm7 \n" "sub %0,%1 \n" // 8 pixel loop. LABELALIGN "1: \n" "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm5,%%ymm6,%%ymm1 \n" "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpmullw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm1,%%ymm1 \n" "vpsrlw $0x8,%%ymm0,%%ymm0 \n" "vpsrlw $0x8,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm6,%%ymm1 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,0x00(%0,%1,1) \n" "lea 0x20(%0),%0 \n" "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kAttenuateShuffle_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBATTENUATEROW_AVX2 diff --git a/source/row_neon.cc b/source/row_neon.cc index 4ed13638..31142a90 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -1827,19 +1827,27 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, ); } +struct RgbUVConstants { + uint8_t kRGBToU[4]; + uint8_t kRGBToV[4]; +}; + // 8x1 pixels. -void ARGBToUV444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vmov.u8 d24, #112 \n" // UB / VR 0.875 - // coefficient - "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient - "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient - "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient - "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient +void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct RgbUVConstants* rgbuvconstants) { + asm volatile( + + "vld1.8 {d0}, [%4] \n" // load rgbuvconstants + "vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient + "vdup.u8 d25, d0[1] \n" // UG -0.5781 coefficient + "vdup.u8 d26, d0[2] \n" // UR -0.2969 coefficient + "vdup.u8 d27, d0[4] \n" // VB -0.1406 coefficient + "vdup.u8 d28, d0[5] \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. @@ -1857,15 +1865,53 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. 
"bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(rgbuvconstants) // %4 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"); } +// RGB to bt601 coefficients +// UB 0.875 coefficient = 112 +// UG -0.5781 coefficient = 74 +// UR -0.2969 coefficient = 38 +// VB -0.1406 coefficient = 18 +// VG -0.7344 coefficient = 94 +// VR 0.875 coefficient = 112 (ignored) + +static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0}, + {18, 94, 112, 0}}; + +// RGB to JPeg coefficients +// UB coeff 0.500 = 127 +// UG coeff -0.33126 = 84 +// UR coeff -0.16874 = 43 +// VB coeff -0.08131 = 20 +// VG coeff -0.41869 = 107 +// VR coeff 0.500 = 127 (ignored) + +static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0}, + {20, 107, 127, 0}}; + +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, + &kRgb24I601UVConstants); +} + +void ARGBToUVJ444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, + &kRgb24JPegUVConstants); +} + // clang-format off // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. #define RGBTOUV(QB, QG, QR) \ @@ -2702,7 +2748,6 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, struct RgbConstants { uint8_t kRGBToY[4]; uint16_t kAddY; - uint16_t pad; }; // RGB to JPeg coefficients @@ -2710,11 +2755,9 @@ struct RgbConstants { // G * 0.5870 coefficient = 150 // R * 0.2990 coefficient = 77 // Add 0.5 = 0x80 -static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, - 128, - 0}; +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128}; -static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 @@ -2723,12 +2766,9 @@ static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; // Add 16.5 = 0x1080 static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, - 0x1080, - 0}; + 0x1080}; -static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, - 0x1080, - 0}; +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080}; // ARGB expects first 3 values to contain RGB and 4th value is ignored. void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, @@ -3058,6 +3098,8 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( + "vmov.u16 q15, #0x00ff \n" // 255 for rounding up + // Attenuate 8 pixels. "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. @@ -3065,16 +3107,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, "vmull.u8 q10, d0, d3 \n" // b * a "vmull.u8 q11, d1, d3 \n" // g * a "vmull.u8 q12, d2, d3 \n" // r * a - "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 - "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 - "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + "vaddhn.u16 d0, q10, q15 \n" // (b + 255) >> 8 + "vaddhn.u16 d1, q11, q15 \n" // (g + 255) >> 8 + "vaddhn.u16 d2, q12, q15 \n" // (r + 255) >> 8 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
"bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q1", "q10", "q11", "q12"); + : "cc", "memory", "q0", "q1", "q10", "q11", "q12", "q15"); } // Quantize 8 ARGB pixels (32 bytes). diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 74190d61..1679f87c 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2198,19 +2198,26 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, ); } +struct RgbUVConstants { + uint8_t kRGBToU[4]; + uint8_t kRGBToV[4]; +}; + // 8x1 pixels. -void ARGBToUV444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movi v24.8b, #112 \n" // UB / VR 0.875 - // coefficient - "movi v25.8b, #74 \n" // UG -0.5781 coefficient - "movi v26.8b, #38 \n" // UR -0.2969 coefficient - "movi v27.8b, #18 \n" // VB -0.1406 coefficient - "movi v28.8b, #94 \n" // VG -0.7344 coefficient - "movi v29.16b,#0x80 \n" // 128.5 +void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct RgbUVConstants* rgbuvconstants) { + asm volatile( + "ldr d0, [%4] \n" // load rgbuvconstants + "dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient + "dup v25.16b, v0.b[1] \n" // UG -0.5781 coefficient + "dup v26.16b, v0.b[2] \n" // UR -0.2969 coefficient + "dup v27.16b, v0.b[4] \n" // VB -0.1406 coefficient + "dup v28.16b, v0.b[5] \n" // VG -0.7344 coefficient + "movi v29.16b, #0x80 \n" // 128.5 + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "subs %w3, %w3, #8 \n" // 8 processed per loop. @@ -2229,15 +2236,53 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(rgbuvconstants) // %4 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", "v27", "v28", "v29"); } +// RGB to bt601 coefficients +// UB 0.875 coefficient = 112 +// UG -0.5781 coefficient = 74 +// UR -0.2969 coefficient = 38 +// VB -0.1406 coefficient = 18 +// VG -0.7344 coefficient = 94 +// VR 0.875 coefficient = 112 (ignored) + +static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0}, + {18, 94, 112, 0}}; + +// RGB to JPeg coefficients +// UB coeff 0.500 = 127 +// UG coeff -0.33126 = 84 +// UR coeff -0.16874 = 43 +// VB coeff -0.08131 = 20 +// VG coeff -0.41869 = 107 +// VR coeff 0.500 = 127 (ignored) + +static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0}, + {20, 107, 127, 0}}; + +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, + &kRgb24I601UVConstants); +} + +void ARGBToUVJ444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, + &kRgb24JPegUVConstants); +} + #define RGBTOUV_SETUP_REG \ "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ @@ -2943,34 +2988,8 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, struct RgbConstants { uint8_t kRGBToY[4]; uint16_t kAddY; - uint16_t pad; }; -// RGB to JPeg coefficients -// B * 0.1140 coefficient = 29 -// G * 0.5870 coefficient = 150 -// R * 0.2990 coefficient = 77 
-// Add 0.5 = 0x80 -static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, - 128, - 0}; - -static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; - -// RGB to BT.601 coefficients -// B * 0.1016 coefficient = 25 -// G * 0.5078 coefficient = 129 -// R * 0.2578 coefficient = 66 -// Add 16.5 = 0x1080 - -static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, - 0x1080, - 0}; - -static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, - 0x1080, - 0}; - // ARGB expects first 3 values to contain RGB and 4th value is ignored. void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, @@ -3005,6 +3024,26 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, "v17"); } +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080}; + void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); } @@ -3402,6 +3441,8 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( + "movi v7.8h, #0x00ff \n" // 255 for rounding up + // Attenuate 8 pixels. "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB @@ -3410,16 +3451,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, "prfm pldl1keep, [%0, 448] \n" "umull v5.8h, v1.8b, v3.8b \n" // g * a "umull v6.8h, v2.8b, v3.8b \n" // r * a - "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 - "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 - "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "addhn v0.8b, v4.8h, v7.8h \n" // (b + 255) >> 8 + "addhn v1.8b, v5.8h, v7.8h \n" // (g + 255) >> 8 + "addhn v2.8b, v6.8h, v7.8h \n" // (r + 255) >> 8 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Quantize 8 ARGB pixels (32 bytes). @@ -3960,6 +4001,86 @@ void ByteToFloatRow_NEON(const uint8_t* src, : "cc", "memory", "v1", "v2", "v3"); } +// Convert FP16 Half Floats to FP32 Floats +void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16 + float* dst, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.8h}, [%0], #16 \n" // load 8 halffloats + "subs %w2, %w2, #8 \n" // 8 floats per loop + "prfm pldl1keep, [%0, 448] \n" + "fcvtl v2.4s, v1.4h \n" // 8 floats + "fcvtl2 v3.4s, v1.8h \n" + "stp q2, q3, [%1], #32 \n" // store 8 floats + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3"); +} + +// Convert FP16 Half Floats to FP32 Floats +// Read a column and write a row +void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 + int src_stride, // stride in elements + float* dst, + int width) { + asm volatile( + "cmp %w2, #8 \n" // Is there 8 rows? 
+ "b.lo 2f \n" + "1: \n" + "ld1 {v0.h}[0], [%0], %3 \n" // load 8 halffloats + "ld1 {v0.h}[1], [%0], %3 \n" + "ld1 {v0.h}[2], [%0], %3 \n" + "ld1 {v0.h}[3], [%0], %3 \n" + "ld1 {v1.h}[0], [%0], %3 \n" + "ld1 {v1.h}[1], [%0], %3 \n" + "ld1 {v1.h}[2], [%0], %3 \n" + "ld1 {v1.h}[3], [%0], %3 \n" + "subs %w2, %w2, #8 \n" // 8 rows per loop + "prfm pldl1keep, [%0, 448] \n" + "fcvtl v2.4s, v0.4h \n" // 4 floats + "fcvtl v3.4s, v1.4h \n" // 4 more floats + "stp q2, q3, [%1], #32 \n" // store 8 floats + "b.gt 1b \n" + "cmp %w2, #1 \n" // Is there 1 value? + "b.lo 3f \n" + "2: \n" + "ld1 {v1.h}[0], [%0], %3 \n" // load 1 halffloats + "subs %w2, %w2, #1 \n" // 1 floats per loop + "fcvtl v2.4s, v1.4h \n" // 1 floats + "str s2, [%1], #4 \n" // store 1 floats + "b.gt 2b \n" + "3: \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)(src_stride * 2)) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +// Convert FP32 Floats to FP16 Half Floats +void ConvertFP32ToFP16Row_NEON(const float* src, + uint16_t* dst, // fp16 + int width) { + asm volatile( + "1: \n" + "ldp q2, q3, [%0], #32 \n" // load 8 floats + "subs %w2, %w2, #8 \n" // 8 floats per loop + "prfm pldl1keep, [%0, 448] \n" + "fcvtn v1.4h, v2.4s \n" // 8 fp16 halffloats + "fcvtn2 v1.8h, v3.4s \n" + "str q1, [%1], #16 \n" // store 8 fp16 halffloats + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3"); +} + float ScaleMaxSamples_NEON(const float* src, float* dst, float scale, diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 27e91a3b..c875be2f 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -17,7 +17,9 @@ #include "libyuv/row.h" -#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) +// This module is for clang rvv. GCC hasn't supported segment load & store. +#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \ + defined(__clang__) #include <assert.h> #include <riscv_vector.h> @@ -29,48 +31,48 @@ extern "C" { // Fill YUV -> RGB conversion constants into vectors // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). 
-#define YUVTORGB_SETUP(vl, yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ - { \ - asm volatile("csrwi vxrm, 0"); \ - ub = yuvconst->kUVCoeff[0]; \ - vr = yuvconst->kUVCoeff[1]; \ - ug = yuvconst->kUVCoeff[2]; \ - vg = yuvconst->kUVCoeff[3]; \ - yg = yuvconst->kRGBCoeffBias[0]; \ - bb = yuvconst->kRGBCoeffBias[1] + 32; \ - bg = yuvconst->kRGBCoeffBias[2] - 32; \ - br = yuvconst->kRGBCoeffBias[3] + 32; \ +#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ + { \ + asm volatile("csrwi vxrm, 0"); \ + ub = yuvconst->kUVCoeff[0]; \ + vr = yuvconst->kUVCoeff[1]; \ + ug = yuvconst->kUVCoeff[2]; \ + vg = yuvconst->kUVCoeff[3]; \ + yg = yuvconst->kRGBCoeffBias[0]; \ + bb = yuvconst->kRGBCoeffBias[1] + 32; \ + bg = yuvconst->kRGBCoeffBias[2] - 32; \ + br = yuvconst->kRGBCoeffBias[3] + 32; \ } -// Read [VLEN/8] Y, [VLEN/(8 * 2)] U and [VLEN/(8 * 2)] V from 422 -#define READYUV422(vl, v_u, v_v, v_y_16) \ - { \ - vuint8m1_t v_tmp0, v_tmp1; \ - vuint8m2_t v_y; \ - vuint16m2_t v_u_16, v_v_16; \ - vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ - v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \ - v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ - v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \ - v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ - v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ - v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ - v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ - v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ - vl = __riscv_vsetvl_e8m2(w); \ - v_y = __riscv_vle8_v_u8m2(src_y, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ +// Read [2*VLEN/8] Y, [VLEN/8] U and [VLEN/8] V from 422 +#define READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ } -// Read [VLEN/8] Y, [VLEN/8] U, and [VLEN/8] V from 444 -#define READYUV444(vl, v_u, v_v, v_y_16) \ - { \ - vuint8m2_t v_y; \ - vl = __riscv_vsetvl_e8m2(w); \ - v_y = __riscv_vle8_v_u8m2(src_y, vl); \ - v_u = __riscv_vle8_v_u8m2(src_u, vl); \ - v_v = __riscv_vle8_v_u8m2(src_v, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ +// Read [2*VLEN/8] Y, [2*VLEN/8] U, and [2*VLEN/8] V from 444 +#define READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \ + { \ + vuint8m2_t v_y; \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_u = __riscv_vle8_v_u8m2(src_u, vl); \ + v_v = __riscv_vle8_v_u8m2(src_v, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ } // Convert from YUV to fixed point RGB @@ -101,6 +103,45 @@ extern "C" { v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl); \ } +// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv +#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_uv, vl); \ + v_u_16 = 
__riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +// Read 2*[VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_vu +#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_vu, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +#ifdef HAS_ARGBTOAR64ROW_RVV void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { size_t avl = (size_t)4 * width; do { @@ -116,7 +157,9 @@ void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { dst_ar64 += vl; } while (avl > 0); } +#endif +#ifdef HAS_ARGBTOAB64ROW_RVV void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { size_t avl = (size_t)width; do { @@ -138,7 +181,9 @@ void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { dst_ab64 += 4 * vl; } while (avl > 0); } +#endif +#ifdef HAS_AR64TOARGBROW_RVV void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { size_t avl = (size_t)4 * width; do { @@ -153,7 +198,9 @@ void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { dst_argb += vl; } while (avl > 0); } +#endif +#ifdef HAS_AB64TOARGBROW_RVV void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { size_t avl = (size_t)width; do { @@ -171,7 +218,9 @@ void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { dst_argb += 4 * vl; } while (avl > 0); } +#endif +#ifdef HAS_RAWTOARGBROW_RVV void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { size_t w = (size_t)width; size_t vl = __riscv_vsetvl_e8m2(w); @@ -186,7 +235,9 @@ void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#endif +#ifdef HAS_RAWTORGBAROW_RVV void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { size_t w = (size_t)width; size_t vl = __riscv_vsetvl_e8m2(w); @@ -201,7 +252,9 @@ void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#endif +#ifdef HAS_RAWTORGB24ROW_RVV void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { size_t w = (size_t)width; do { @@ -214,7 +267,9 @@ void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { dst_rgb24 += vl * 3; } while (w > 0); } +#endif +#ifdef HAS_ARGBTORAWROW_RVV void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { size_t w = (size_t)width; do { @@ -227,7 +282,9 @@ void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* 
dst_raw, int width) { dst_raw += vl * 3; } while (w > 0); } +#endif +#ifdef HAS_ARGBTORGB24ROW_RVV void ARGBToRGB24Row_RVV(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { @@ -242,7 +299,9 @@ void ARGBToRGB24Row_RVV(const uint8_t* src_argb, dst_rgb24 += vl * 3; } while (w > 0); } +#endif +#ifdef HAS_RGB24TOARGBROW_RVV void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { @@ -259,24 +318,26 @@ void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#endif +#ifdef HAS_I444TOARGBROW_RVV void I444ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - size_t vl; size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); uint8_t ub, vr, ug, vg; int16_t yg, bb, bg, br; vuint8m2_t v_u, v_v; vuint8m2_t v_b, v_g, v_r, v_a; vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { - READYUV444(vl, v_u, v_v, v_y_16); + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); @@ -288,7 +349,9 @@ void I444ToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_I444ALPHATOARGBROW_RVV void I444AlphaToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -303,9 +366,9 @@ void I444AlphaToARGBRow_RVV(const uint8_t* src_y, vuint8m2_t v_u, v_v; vuint8m2_t v_b, v_g, v_r, v_a; vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { - READYUV444(vl, v_u, v_v, v_y_16); + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); v_a = __riscv_vle8_v_u8m2(src_a, vl); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); @@ -319,7 +382,9 @@ void I444AlphaToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_I444TORGB24ROW_RVV void I444ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -333,9 +398,9 @@ void I444ToRGB24Row_RVV(const uint8_t* src_y, vuint8m2_t v_u, v_v; vuint8m2_t v_b, v_g, v_r; vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { - READYUV444(vl, v_u, v_v, v_y_16); + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); @@ -347,24 +412,26 @@ void I444ToRGB24Row_RVV(const uint8_t* src_y, dst_rgb24 += vl * 3; } while (w > 0); } +#endif +#ifdef HAS_I422TOARGBROW_RVV void I422ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - size_t vl; size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); uint8_t ub, vr, ug, vg; int16_t yg, bb, bg, br; vuint8m2_t v_u, v_v; vuint8m2_t v_b, v_g, v_r, v_a; vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, 
bg, br); v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { - READYUV422(vl, v_u, v_v, v_y_16); + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); @@ -376,7 +443,9 @@ void I422ToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_I422ALPHATOARGBROW_RVV void I422AlphaToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -391,9 +460,9 @@ void I422AlphaToARGBRow_RVV(const uint8_t* src_y, vuint8m2_t v_u, v_v; vuint8m2_t v_b, v_g, v_r, v_a; vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { - READYUV422(vl, v_u, v_v, v_y_16); + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); v_a = __riscv_vle8_v_u8m2(src_a, vl); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); @@ -407,24 +476,26 @@ void I422AlphaToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_I422TORGBAROW_RVV void I422ToRGBARow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - size_t vl; size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); uint8_t ub, vr, ug, vg; int16_t yg, bb, bg, br; vuint8m2_t v_u, v_v; vuint8m2_t v_b, v_g, v_r, v_a; vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { - READYUV422(vl, v_u, v_v, v_y_16); + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); @@ -436,7 +507,9 @@ void I422ToRGBARow_RVV(const uint8_t* src_y, dst_rgba += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_I422TORGB24ROW_RVV void I422ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -450,9 +523,9 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y, vuint8m2_t v_u, v_v; vuint8m2_t v_b, v_g, v_r; vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { - READYUV422(vl, v_u, v_v, v_y_16); + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); @@ -464,7 +537,9 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y, dst_rgb24 += vl * 3; } while (w > 0); } +#endif +#ifdef HAS_I400TOARGBROW_RVV void I400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, @@ -503,7 +578,9 @@ void I400ToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_J400TOARGBROW_RVV void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { size_t w = (size_t)width; size_t vl = __riscv_vsetvl_e8m2(w); @@ -518,7 +595,9 @@ void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#endif +#ifdef HAS_COPYROW_RVV void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) { size_t w = 
(size_t)width; do { @@ -530,8 +609,125 @@ void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) { dst += vl; } while (w > 0); } +#endif + +#ifdef HAS_NV12TOARGBROW_RVV +void NV12ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_uv += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_NV12TORGB24ROW_RVV +void NV12ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_uv += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif + +#ifdef HAS_NV21TOARGBROW_RVV +void NV21ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_vu += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_NV21TORGB24ROW_RVV +void NV21ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_vu += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif // Bilinear filter 
[VLEN/8]x2 -> [VLEN/8]x1 + +#ifdef HAS_INTERPOLATEROW_RVV void InterpolateRow_RVV(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, @@ -554,13 +750,16 @@ void InterpolateRow_RVV(uint8_t* dst_ptr, } while (dst_w > 0); return; } + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up(0). + asm volatile("csrwi vxrm, 0"); // Blend 50 / 50. if (y1_fraction == 128) { do { size_t vl = __riscv_vsetvl_e8m8(dst_w); vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl); vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl); - // Averaging add + // Use round-to-nearest-up mode for averaging add vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl); __riscv_vse8_v_u8m8(dst_ptr, row_out, vl); dst_w -= vl; @@ -571,15 +770,13 @@ void InterpolateRow_RVV(uint8_t* dst_ptr, return; } // General purpose row blend. - // To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) is set to round-to-nearest-up(0). - asm volatile("csrwi vxrm, 0"); do { size_t vl = __riscv_vsetvl_e8m4(dst_w); vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl); vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl); vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl); acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl); + // Use round-to-nearest-up mode for vnclip __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl); dst_w -= vl; src_ptr += vl; @@ -587,7 +784,9 @@ void InterpolateRow_RVV(uint8_t* dst_ptr, dst_ptr += vl; } while (dst_w > 0); } +#endif +#ifdef HAS_SPLITRGBROW_RVV void SplitRGBRow_RVV(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, @@ -608,7 +807,9 @@ void SplitRGBRow_RVV(const uint8_t* src_rgb, src_rgb += vl * 3; } while (w > 0); } +#endif +#ifdef HAS_MERGERGBROW_RVV void MergeRGBRow_RVV(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -628,7 +829,9 @@ void MergeRGBRow_RVV(const uint8_t* src_r, dst_rgb += vl * 3; } while (w > 0); } +#endif +#ifdef HAS_SPLITARGBROW_RVV void SplitARGBRow_RVV(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -652,7 +855,9 @@ void SplitARGBRow_RVV(const uint8_t* src_argb, src_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_MERGEARGBROW_RVV void MergeARGBRow_RVV(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -675,7 +880,9 @@ void MergeARGBRow_RVV(const uint8_t* src_r, dst_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_SPLITXRGBROW_RVV void SplitXRGBRow_RVV(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -696,7 +903,9 @@ void SplitXRGBRow_RVV(const uint8_t* src_argb, src_argb += vl * 4; } while (w > 0); } +#endif +#ifdef HAS_MERGEXRGBROW_RVV void MergeXRGBRow_RVV(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -719,7 +928,9 @@ void MergeXRGBRow_RVV(const uint8_t* src_r, vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#endif +#ifdef HAS_SPLITUVROW_RVV void SplitUVRow_RVV(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -737,7 +948,9 @@ void SplitUVRow_RVV(const uint8_t* src_uv, src_uv += 2 * vl; } while (w > 0); } +#endif +#ifdef HAS_MERGEUVROW_RVV void MergeUVRow_RVV(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, @@ -755,6 +968,7 @@ void MergeUVRow_RVV(const uint8_t* src_u, dst_uv += 2 * vl; } while (w > 0); } +#endif struct RgbConstants { uint8_t kRGBToY[4]; @@ -787,7 +1001,8 @@ static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080, 0}; -// ARGB expects first 3 values to contain RGB and 4th value is 
ignored. +// ARGB expects first 3 values to contain RGB and 4th value is ignored +#ifdef HAS_ARGBTOYMATRIXROW_RVV void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width, @@ -817,24 +1032,34 @@ void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, dst_y += vl; } while (w > 0); } +#endif +#ifdef HAS_ARGBTOYROW_RVV void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) { ARGBToYMatrixRow_RVV(src_argb, dst_y, width, &kRgb24I601Constants); } +#endif +#ifdef HAS_ARGBTOYJROW_RVV void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width) { ARGBToYMatrixRow_RVV(src_argb, dst_yj, width, &kRgb24JPEGConstants); } +#endif +#ifdef HAS_ABGRTOYROW_RVV void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width) { ARGBToYMatrixRow_RVV(src_abgr, dst_y, width, &kRawI601Constants); } +#endif +#ifdef HAS_ABGRTOYJROW_RVV void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { ARGBToYMatrixRow_RVV(src_abgr, dst_yj, width, &kRawJPEGConstants); } +#endif // RGBA expects first value to be A and ignored, then 3 values to contain RGB. +#ifdef HAS_RGBATOYMATRIXROW_RVV void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width, @@ -864,19 +1089,27 @@ void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, dst_y += vl; } while (w > 0); } +#endif +#ifdef HAS_RGBATOYROW_RVV void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) { RGBAToYMatrixRow_RVV(src_rgba, dst_y, width, &kRgb24I601Constants); } +#endif +#ifdef HAS_RGBATOYJROW_RVV void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { RGBAToYMatrixRow_RVV(src_rgba, dst_yj, width, &kRgb24JPEGConstants); } +#endif +#ifdef HAS_BGRATOYROW_RVV void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) { RGBAToYMatrixRow_RVV(src_bgra, dst_y, width, &kRawI601Constants); } +#endif +#ifdef HAS_RGBTOYMATRIXROW_RVV void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, uint8_t* dst_y, int width, @@ -906,51 +1139,179 @@ void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, dst_y += vl; } while (w > 0); } +#endif +#ifdef HAS_RGB24TOYJROW_RVV void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { RGBToYMatrixRow_RVV(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); } +#endif +#ifdef HAS_RAWTOYJROW_RVV void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width) { RGBToYMatrixRow_RVV(src_raw, dst_yj, width, &kRawJPEGConstants); } +#endif +#ifdef HAS_RGB24TOYROW_RVV void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { RGBToYMatrixRow_RVV(src_rgb24, dst_y, width, &kRgb24I601Constants); } +#endif +#ifdef HAS_RAWTOYROW_RVV void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) { RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants); } +#endif + +// Blend src_argb over src_argb1 and store to dst_argb. +// dst_argb may be src_argb or src_argb1. +// src_argb: RGB values have already been pre-multiplied by the a. 
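The comment block above states the contract for the blend rows: the foreground RGB has already been multiplied by its alpha, so per channel the "over" operation is clamp255(b - (b * a >> 8) + f), which is what the RVV row function that follows computes with vmulhu/vsub/vsaddu. A scalar reference of that exact arithmetic (a sketch with illustrative names, not the libyuv C kernel verbatim):

#include <stdint.h>

// Premultiplied-alpha "over" for one channel.
// f = foreground channel (already multiplied by alpha),
// a = foreground alpha, b = background channel.
static uint8_t BlendChannel(uint8_t f, uint8_t a, uint8_t b) {
  uint32_t scaled = ((uint32_t)b * a) >> 8;  // vmulhu: high byte of b * a
  uint32_t r = (uint32_t)b - scaled + f;     // vsub, then add foreground
  return (uint8_t)(r > 255 ? 255 : r);       // vsaddu saturates the add
}

Note that b - (b * a >> 8) can never underflow, since the high byte of b * a is at most b; only the final add needs saturation.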
+#ifdef HAS_ARGBBLENDROW_RVV +void ARGBBlendRow_RVV(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvlmax_e8m2(); + // clamp255((((256 - a) * b) >> 8) + f) + // = b * (256 - a) / 256 + f + // = b - (b * a / 256) + f + vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl); + do { + vuint8m2_t v_src0_b, v_src0_g, v_src0_r, v_src0_a; + vuint8m2_t v_src1_b, v_src1_g, v_src1_r, v_src1_a; + vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r; + vuint8m2_t v_dst_b, v_dst_g, v_dst_r; + vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_src0_b, &v_src0_g, &v_src0_r, &v_src0_a, + src_argb, vl); + __riscv_vlseg4e8_v_u8m2(&v_src1_b, &v_src1_g, &v_src1_r, &v_src1_a, + src_argb1, vl); + + v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl); + v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl); + v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl); + + v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl); + v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl); + v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl); + + v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl); + v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl); + v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_dst_b, v_dst_g, v_dst_r, v_255, vl); + + w -= vl; + src_argb += 4 * vl; + src_argb1 += 4 * vl; + dst_argb += 4 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_BLENDPLANEROW_RVV +void BlendPlaneRow_RVV(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + size_t w = (size_t)width; + do { + vuint16m8_t v_dst_u16; + vuint8m4_t v_dst; + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl); + vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl); + vuint8m4_t v_alpha = __riscv_vle8_v_u8m4(alpha, vl); + vuint8m4_t v_255_minus_alpha = __riscv_vrsub_vx_u8m4(v_alpha, 255u, vl); + + // (a * foreground) + (1-a) * background + v_dst_u16 = __riscv_vwmulu_vv_u16m8(v_alpha, v_src0, vl); + v_dst_u16 = + __riscv_vwmaccu_vv_u16m8(v_dst_u16, v_255_minus_alpha, v_src1, vl); + v_dst_u16 = __riscv_vadd_vx_u16m8(v_dst_u16, 255u, vl); + v_dst = __riscv_vnsrl_wx_u8m4(v_dst_u16, 8, vl); + + __riscv_vse8_v_u8m4(dst, v_dst, vl); + w -= vl; + src0 += vl; + src1 += vl; + alpha += vl; + dst += vl; + } while (w > 0); +} +#endif +// Attenuate: (f * a + 255) >> 8 +#ifdef HAS_ARGBATTENUATEROW_RVV void ARGBAttenuateRow_RVV(const uint8_t* src_argb, uint8_t* dst_argb, int width) { size_t w = (size_t)width; - // To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) is set to round-to-nearest-up(0). 
- asm volatile("csrwi vxrm, 0"); do { vuint8m2_t v_b, v_g, v_r, v_a; vuint16m4_t v_ba_16, v_ga_16, v_ra_16; size_t vl = __riscv_vsetvl_e8m2(w); __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + // f * a v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl); v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl); v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl); - v_b = __riscv_vnclipu_wx_u8m2(v_ba_16, 8, vl); - v_g = __riscv_vnclipu_wx_u8m2(v_ga_16, 8, vl); - v_r = __riscv_vnclipu_wx_u8m2(v_ra_16, 8, vl); + // f * a + 255 + v_ba_16 = __riscv_vadd_vx_u16m4(v_ba_16, 255u, vl); + v_ga_16 = __riscv_vadd_vx_u16m4(v_ga_16, 255u, vl); + v_ra_16 = __riscv_vadd_vx_u16m4(v_ra_16, 255u, vl); + // (f * a + 255) >> 8 + v_b = __riscv_vnsrl_wx_u8m2(v_ba_16, 8, vl); + v_g = __riscv_vnsrl_wx_u8m2(v_ga_16, 8, vl); + v_r = __riscv_vnsrl_wx_u8m2(v_ra_16, 8, vl); __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); w -= vl; src_argb += vl * 4; dst_argb += vl * 4; } while (w > 0); } +#endif + +#ifdef HAS_ARGBEXTRACTALPHAROW_RVV +void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_b, v_g, v_r, v_a; + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_a, v_a, vl); + w -= vl; + src_argb += vl * 4; + dst_a += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_ARGBCOPYYTOALPHAROW_RVV +void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width) { + size_t w = (size_t)width; + const ptrdiff_t dst_stride = 4; + dst += 3; + do { + size_t vl = __riscv_vsetvl_e8m8(w); + vuint8m8_t v_a = __riscv_vle8_v_u8m8(src, vl); + __riscv_vsse8_v_u8m8(dst, dst_stride, v_a, vl); + w -= vl; + src += vl; + dst += vl * dst_stride; + } while (w > 0); +} +#endif #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) +#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && + // defined(__clang__) diff --git a/source/scale.cc b/source/scale.cc index 80b030dc..43d973af 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -135,6 +135,14 @@ static void ScalePlaneDown2(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN2_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowDown2 = filtering == kFilterNone + ? ScaleRowDown2_RVV + : (filtering == kFilterLinear ? ScaleRowDown2Linear_RVV + : ScaleRowDown2Box_RVV); + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -312,6 +320,11 @@ static void ScalePlaneDown4(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN4_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowDown4 = filtering ? 
ScaleRowDown4Box_RVV : ScaleRowDown4_RVV; + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -472,6 +485,17 @@ static void ScalePlaneDown34(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN34_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_RVV; + ScaleRowDown34_1 = ScaleRowDown34_RVV; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_RVV; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_RVV; + } + } +#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); @@ -687,6 +711,17 @@ static void ScalePlaneDown38(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN38_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_RVV; + ScaleRowDown38_2 = ScaleRowDown38_RVV; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_RVV; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_RVV; + } + } +#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); @@ -971,6 +1006,11 @@ static void ScalePlaneBox(int src_width, } } #endif +#if defined(HAS_SCALEADDROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleAddRow = ScaleAddRow_RVV; + } +#endif for (j = 0; j < dst_height; ++j) { int boxheight; @@ -1048,15 +1088,15 @@ static void ScalePlaneBox_16(int src_width, } // Scale plane down with bilinear interpolation. -void ScalePlaneBilinearDown(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { +static void ScalePlaneBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; @@ -1176,15 +1216,15 @@ void ScalePlaneBilinearDown(int src_width, free_aligned_buffer_64(row); } -void ScalePlaneBilinearDown_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { +static void ScalePlaneBilinearDown_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; @@ -1268,15 +1308,15 @@ void ScalePlaneBilinearDown_16(int src_width, } // Scale up down with bilinear interpolation. -void ScalePlaneBilinearUp(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { +static void ScalePlaneBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { int j; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -1425,14 +1465,14 @@ void ScalePlaneBilinearUp(int src_width, // This is an optimized version for scaling up a plane to 2 times of // its original width, using linear interpolation. // This is used to scale U and V planes of I422 to I444. 
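The row kernel behind this path blends each pair of horizontal neighbours with 3:1 / 1:3 weights, rounding to nearest. A scalar sketch of the full-row variant the RVV code below implements (hypothetical name; edge pixels are copied because they have only one neighbour; assumes an even dst_width >= 2):

#include <stdint.h>

static void Up2LinearRowSketch(const uint8_t* src, uint8_t* dst,
                               int dst_width) {
  int src_width = dst_width >> 1;
  dst[0] = src[0];  // left edge has no left neighbour
  for (int x = 0; x < src_width - 1; ++x) {
    dst[2 * x + 1] = (uint8_t)((3 * src[x] + src[x + 1] + 2) >> 2);
    dst[2 * x + 2] = (uint8_t)((src[x] + 3 * src[x + 1] + 2) >> 2);
  }
  dst[dst_width - 1] = src[src_width - 1];  // right edge
}

The SIMD _Any_ variants dispatched above run a kernel like this on the interior only and fall back to C at the edges; ScaleRowUp2_Linear_RVV covers the whole row itself.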
-void ScalePlaneUp2_Linear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { +static void ScalePlaneUp2_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = ScaleRowUp2_Linear_Any_C; int i; @@ -1465,6 +1505,11 @@ void ScalePlaneUp2_Linear(int src_width, ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; } #endif +#ifdef HAS_SCALEROWUP2_LINEAR_RVV + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowUp = ScaleRowUp2_Linear_RVV; + } +#endif if (dst_height == 1) { ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr, @@ -1484,14 +1529,14 @@ void ScalePlaneUp2_Linear(int src_width, // This is an optimized version for scaling up a plane to 2 times of // its original size, using bilinear interpolation. // This is used to scale U and V planes of I420 to I444. -void ScalePlaneUp2_Bilinear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { +static void ScalePlaneUp2_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_Any_C; @@ -1524,6 +1569,11 @@ void ScalePlaneUp2_Bilinear(int src_width, Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; } #endif +#ifdef HAS_SCALEROWUP2_BILINEAR_RVV + if (TestCpuFlag(kCpuHasRVV)) { + Scale2RowUp = ScaleRowUp2_Bilinear_RVV; + } +#endif Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); dst_ptr += dst_stride; @@ -1544,14 +1594,14 @@ void ScalePlaneUp2_Bilinear(int src_width, // its original width, using linear interpolation. // stride is in count of uint16_t. // This is used to scale U and V planes of I210 to I410 and I212 to I412. -void ScalePlaneUp2_12_Linear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { +static void ScalePlaneUp2_12_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) = ScaleRowUp2_Linear_16_Any_C; int i; @@ -1598,14 +1648,14 @@ void ScalePlaneUp2_12_Linear(int src_width, // its original size, using bilinear interpolation. // stride is in count of uint16_t. // This is used to scale U and V planes of I010 to I410 and I012 to I412. 
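The bilinear variant applies the same scheme in both dimensions: a 2x2 tap with weights 9:3:3:1 over 16, round to nearest — the weights visible in the ScaleRowUp2_16_C reference removed later in this change. A scalar sketch of one tap follows; for 10/12-bit samples the sum still fits in 16 bits (16 * 4095 + 8 = 65528), which is presumably why a dedicated _12_ path exists alongside the general _16_ one:

#include <stdint.h>

// p0 is the nearest source sample, p1 its horizontal neighbour,
// p2/p3 the corresponding pair from the adjacent row.
static inline uint16_t Up2Bilinear12Sketch(uint16_t p0, uint16_t p1,
                                           uint16_t p2, uint16_t p3) {
  return (uint16_t)((9 * p0 + 3 * p1 + 3 * p2 + p3 + 8) >> 4);
}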
-void ScalePlaneUp2_12_Bilinear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { +static void ScalePlaneUp2_12_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; @@ -1645,14 +1695,14 @@ void ScalePlaneUp2_12_Bilinear(int src_width, } } -void ScalePlaneUp2_16_Linear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { +static void ScalePlaneUp2_16_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) = ScaleRowUp2_Linear_16_Any_C; int i; @@ -1694,14 +1744,14 @@ void ScalePlaneUp2_16_Linear(int src_width, } } -void ScalePlaneUp2_16_Bilinear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { +static void ScalePlaneUp2_16_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; @@ -1741,15 +1791,15 @@ void ScalePlaneUp2_16_Bilinear(int src_width, } } -void ScalePlaneBilinearUp_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { +static void ScalePlaneBilinearUp_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { int j; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; diff --git a/source/scale_argb.cc b/source/scale_argb.cc index ddd8d29e..1d5c1b60 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -16,6 +16,7 @@ #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" // For CopyARGB #include "libyuv/row.h" +#include "libyuv/scale_argb.h" #include "libyuv/scale_row.h" #ifdef __cplusplus @@ -127,6 +128,15 @@ static void ScaleARGBDown2(int src_width, } } #endif +#if defined(HAS_SCALEARGBROWDOWN2_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_RVV + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_RVV + : ScaleARGBRowDown2Box_RVV); + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -184,6 +194,11 @@ static void ScaleARGBDown4Box(int src_width, } } #endif +#if defined(HAS_SCALEARGBROWDOWN2_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_RVV; + } +#endif for (j = 0; j < dst_height; ++j) { ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); @@ -263,6 +278,12 @@ static void ScaleARGBDownEven(int src_width, } } #endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleARGBRowDownEven = + filtering ? 
ScaleARGBRowDownEvenBox_RVV : ScaleARGBRowDownEven_RVV; + } +#endif if (filtering == kFilterLinear) { src_stride = 0; diff --git a/source/scale_common.cc b/source/scale_common.cc index 77455903..d07a39af 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -1964,35 +1964,6 @@ void ScaleSlope(int src_width, } #undef CENTERSTART -// Read 8x2 upsample with filtering and write 16x1. -// actually reads an extra pixel, so 9x2. -void ScaleRowUp2_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src2 = src_ptr + src_stride; - - int x; - for (x = 0; x < dst_width - 1; x += 2) { - uint16_t p0 = src_ptr[0]; - uint16_t p1 = src_ptr[1]; - uint16_t p2 = src2[0]; - uint16_t p3 = src2[1]; - dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; - dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4; - ++src_ptr; - ++src2; - dst += 2; - } - if (dst_width & 1) { - uint16_t p0 = src_ptr[0]; - uint16_t p1 = src_ptr[1]; - uint16_t p2 = src2[0]; - uint16_t p3 = src2[1]; - dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; - } -} - #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index ad06ee83..7c072380 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -1118,101 +1118,6 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, #undef LOAD2_DATA8_LANE -// 16x2 -> 16x1 -void ScaleFilterRows_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - int y_fraction = 256 - source_y_fraction; - asm volatile( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "add %2, %2, %1 \n" - "cmp %w4, #64 \n" - "b.eq 75f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - "cmp %w4, #192 \n" - "b.eq 25f \n" - - "dup v5.8b, %w4 \n" - "dup v4.8b, %w5 \n" - // General purpose row blend. - "1: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v6.8h, v0.8b, v4.8b \n" - "umull2 v7.8h, v0.16b, v4.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "umlal v6.8h, v1.8b, v5.8b \n" - "umlal2 v7.8h, v1.16b, v5.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "rshrn v0.8b, v6.8h, #8 \n" - "rshrn2 v0.16b, v7.8h, #8 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" - - // Blend 25 / 75. - "25: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 25b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" - - // Blend 75 / 25. - "75: \n" - "ld1 {v1.16b}, [%1], #16 \n" - "ld1 {v0.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 75b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. 
- "100: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" - - "99: \n" - "st1 {v0.b}[15], [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction), // %4 - "+r"(y_fraction) // %5 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"); -} - void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, diff --git a/source/scale_rvv.cc b/source/scale_rvv.cc new file mode 100644 index 00000000..fd14842d --- /dev/null +++ b/source/scale_rvv.cc @@ -0,0 +1,1038 @@ +/* + * Copyright 2023 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * Contributed by Darren Hsieh <darren.hsieh@sifive.com> + * Contributed by Bruce Lai <bruce.lai@sifive.com> + */ + +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +// This module is for clang rvv. GCC hasn't supported segment load & store. +#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \ + defined(__clang__) +#include <assert.h> +#include <riscv_vector.h> +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#ifdef HAS_SCALEADDROW_RVV +void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + size_t w = (size_t)src_width; + do { + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4_t v_src = __riscv_vle8_v_u8m4(src_ptr, vl); + vuint16m8_t v_dst = __riscv_vle16_v_u16m8(dst_ptr, vl); + // Use widening multiply-add instead of widening + add + v_dst = __riscv_vwmaccu_vx_u16m8(v_dst, 1, v_src, vl); + __riscv_vse16_v_u16m8(dst_ptr, v_dst, vl); + w -= vl; + src_ptr += vl; + dst_ptr += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEARGBROWDOWN2_RVV +void ScaleARGBRowDown2_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + size_t w = (size_t)dst_width; + const uint64_t* src = (const uint64_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + do { + size_t vl = __riscv_vsetvl_e64m8(w); + vuint64m8_t v_data = __riscv_vle64_v_u64m8(src, vl); + vuint32m4_t v_dst = __riscv_vnsrl_wx_u32m4(v_data, 32, vl); + __riscv_vse32_v_u32m4(dst, v_dst, vl); + w -= vl; + src += vl; + dst += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEARGBROWDOWN2LINEAR_RVV +void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + size_t w = (size_t)dst_width; + const uint32_t* src = (const uint32_t*)(src_argb); + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). 
+ asm volatile("csrwi vxrm, 0"); + do { + vuint8m4_t v_odd, v_even, v_dst; + vuint32m4_t v_odd_32, v_even_32; + size_t vl = __riscv_vsetvl_e32m4(w); + __riscv_vlseg2e32_v_u32m4(&v_even_32, &v_odd_32, src, vl); + v_even = __riscv_vreinterpret_v_u32m4_u8m4(v_even_32); + v_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_odd_32); + // Use round-to-nearest-up mode for averaging add + v_dst = __riscv_vaaddu_vv_u8m4(v_even, v_odd, vl * 4); + __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4); + w -= vl; + src += vl * 2; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEARGBROWDOWN2BOX_RVV +void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + size_t w = (size_t)dst_width; + const uint32_t* src0 = (const uint32_t*)(src_argb); + const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride); + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). + asm volatile("csrwi vxrm, 0"); + do { + vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst; + vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16; + vuint32m4_t v_row0_odd_32, v_row0_even_32, v_row1_odd_32, v_row1_even_32; + size_t vl = __riscv_vsetvl_e32m4(w); + __riscv_vlseg2e32_v_u32m4(&v_row0_even_32, &v_row0_odd_32, src0, vl); + __riscv_vlseg2e32_v_u32m4(&v_row1_even_32, &v_row1_odd_32, src1, vl); + v_row0_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_even_32); + v_row0_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_odd_32); + v_row1_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_even_32); + v_row1_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_odd_32); + v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_even, v_row0_odd, vl * 4); + v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_even, v_row1_odd, vl * 4); + v_dst_16 = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4); + // Use round-to-nearest-up mode for vnclip + v_dst = __riscv_vnclipu_wx_u8m4(v_dst_16, 2, vl * 4); + __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4); + w -= vl; + src0 += vl * 2; + src1 += vl * 2; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + size_t w = (size_t)dst_width; + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + const int stride_byte = src_stepx * 4; + do { + size_t vl = __riscv_vsetvl_e32m8(w); + vuint32m8_t v_row = __riscv_vlse32_v_u32m8(src, stride_byte, vl); + __riscv_vse32_v_u32m8(dst, v_row, vl); + w -= vl; + src += vl * src_stepx; + dst += vl; + } while (w > 0); +} + +#ifdef HAS_SCALEARGBROWDOWNEVENBOX_RVV +void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + size_t w = (size_t)dst_width; + const uint32_t* src0 = (const uint32_t*)(src_argb); + const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride); + const int stride_byte = src_stepx * 4; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). 
+ asm volatile("csrwi vxrm, 0"); + do { + vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst; + vuint16m8_t v_row0_sum, v_row1_sum, v_sum; + vuint32m4_t v_row0_low_32, v_row0_high_32, v_row1_low_32, v_row1_high_32; + size_t vl = __riscv_vsetvl_e32m4(w); + __riscv_vlsseg2e32_v_u32m4(&v_row0_low_32, &v_row0_high_32, src0, + stride_byte, vl); + __riscv_vlsseg2e32_v_u32m4(&v_row1_low_32, &v_row1_high_32, src1, + stride_byte, vl); + v_row0_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_low_32); + v_row0_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_high_32); + v_row1_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_low_32); + v_row1_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_high_32); + v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_low, v_row0_high, vl * 4); + v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_low, v_row1_high, vl * 4); + v_sum = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4); + // Use round-to-nearest-up mode for vnclip + v_dst = __riscv_vnclipu_wx_u8m4(v_sum, 2, vl * 4); + __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4); + w -= vl; + src0 += vl * src_stepx; + src1 += vl * src_stepx; + dst_argb += vl * 4; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN2_RVV +void ScaleRowDown2_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + size_t w = (size_t)dst_width; + const uint16_t* src = (const uint16_t*)src_ptr; + (void)src_stride; + do { + size_t vl = __riscv_vsetvl_e16m8(w); + vuint16m8_t v_src = __riscv_vle16_v_u16m8(src, vl); + vuint8m4_t v_dst = __riscv_vnsrl_wx_u8m4(v_src, 8, vl); + __riscv_vse8_v_u8m4(dst, v_dst, vl); + w -= vl; + src += vl; + dst += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN2LINEAR_RVV +void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + size_t w = (size_t)dst_width; + (void)src_stride; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). + asm volatile("csrwi vxrm, 0"); + do { + vuint8m4_t v_s0, v_s1, v_dst; + size_t vl = __riscv_vsetvl_e8m4(w); + __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, src_ptr, vl); + // Use round-to-nearest-up mode for averaging add + v_dst = __riscv_vaaddu_vv_u8m4(v_s0, v_s1, vl); + __riscv_vse8_v_u8m4(dst, v_dst, vl); + w -= vl; + src_ptr += 2 * vl; + dst += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN2BOX_RVV +void ScaleRowDown2Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + size_t w = (size_t)dst_width; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). 
+ asm volatile("csrwi vxrm, 0"); + do { + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4_t v_s0, v_s1, v_t0, v_t1; + vuint16m8_t v_s01, v_t01, v_st01; + vuint8m4_t v_dst; + __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, s, vl); + __riscv_vlseg2e8_v_u8m4(&v_t0, &v_t1, t, vl); + v_s01 = __riscv_vwaddu_vv_u16m8(v_s0, v_s1, vl); + v_t01 = __riscv_vwaddu_vv_u16m8(v_t0, v_t1, vl); + v_st01 = __riscv_vadd_vv_u16m8(v_s01, v_t01, vl); + // Use round-to-nearest-up mode for vnclip + v_dst = __riscv_vnclipu_wx_u8m4(v_st01, 2, vl); + __riscv_vse8_v_u8m4(dst, v_dst, vl); + w -= vl; + s += 2 * vl; + t += 2 * vl; + dst += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN4_RVV +void ScaleRowDown4_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width; + (void)src_stride; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_s0, v_s1, v_s2, v_s3; + __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl); + __riscv_vse8_v_u8m2(dst_ptr, v_s2, vl); + w -= vl; + src_ptr += (4 * vl); + dst_ptr += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN4BOX_RVV +void ScaleRowDown4Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + size_t w = (size_t)dst_width; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). + asm volatile("csrwi vxrm, 0"); + do { + vuint8m2_t v_s0, v_s1, v_s2, v_s3; + vuint8m2_t v_t0, v_t1, v_t2, v_t3; + vuint8m2_t v_u0, v_u1, v_u2, v_u3; + vuint8m2_t v_v0, v_v1, v_v2, v_v3; + vuint16m4_t v_s01, v_s23, v_t01, v_t23; + vuint16m4_t v_u01, v_u23, v_v01, v_v23; + vuint16m4_t v_st01, v_st23, v_uv01, v_uv23; + vuint16m4_t v_st0123, v_uv0123, v_stuv0123; + vuint8m2_t v_dst; + size_t vl = __riscv_vsetvl_e8m2(w); + + __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl); + v_s01 = __riscv_vwaddu_vv_u16m4(v_s0, v_s1, vl); + + __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, src_ptr1, vl); + v_t01 = __riscv_vwaddu_vv_u16m4(v_t0, v_t1, vl); + + __riscv_vlseg4e8_v_u8m2(&v_u0, &v_u1, &v_u2, &v_u3, src_ptr2, vl); + v_u01 = __riscv_vwaddu_vv_u16m4(v_u0, v_u1, vl); + v_u23 = __riscv_vwaddu_vv_u16m4(v_u2, v_u3, vl); + + v_s23 = __riscv_vwaddu_vv_u16m4(v_s2, v_s3, vl); + v_t23 = __riscv_vwaddu_vv_u16m4(v_t2, v_t3, vl); + v_st01 = __riscv_vadd_vv_u16m4(v_s01, v_t01, vl); + v_st23 = __riscv_vadd_vv_u16m4(v_s23, v_t23, vl); + + __riscv_vlseg4e8_v_u8m2(&v_v0, &v_v1, &v_v2, &v_v3, src_ptr3, vl); + + v_v01 = __riscv_vwaddu_vv_u16m4(v_v0, v_v1, vl); + v_v23 = __riscv_vwaddu_vv_u16m4(v_v2, v_v3, vl); + + v_uv01 = __riscv_vadd_vv_u16m4(v_u01, v_v01, vl); + v_uv23 = __riscv_vadd_vv_u16m4(v_u23, v_v23, vl); + + v_st0123 = __riscv_vadd_vv_u16m4(v_st01, v_st23, vl); + v_uv0123 = __riscv_vadd_vv_u16m4(v_uv01, v_uv23, vl); + v_stuv0123 = __riscv_vadd_vv_u16m4(v_st0123, v_uv0123, vl); + // Use round-to-nearest-up mode for vnclip + v_dst = __riscv_vnclipu_wx_u8m2(v_stuv0123, 4, vl); + __riscv_vse8_v_u8m2(dst_ptr, v_dst, vl); + w -= vl; + src_ptr += 4 * vl; + src_ptr1 += 4 * vl; + src_ptr2 += 4 * vl; + src_ptr3 += 4 * vl; + dst_ptr += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN34_RVV +void ScaleRowDown34_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + 
do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_s0, v_s1, v_s2, v_s3; + __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl); + __riscv_vsseg3e8_v_u8m2(dst_ptr, v_s0, v_s1, v_s3, vl); + w -= vl; + src_ptr += 4 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN34_0_BOX_RVV +void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). + asm volatile("csrwi vxrm, 0"); + do { + vuint8m2_t v_s0, v_s1, v_s2, v_s3; + vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16; + vuint8m2_t v_u0, v_u1, v_u2, v_u3; + vuint16m4_t v_u1_u16; + vuint8m2_t v_a0, v_a1, v_a2; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl); + + if (src_stride == 0) { + v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl); + v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl); + v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_s2, 2, vl); + v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_s3, 2, vl); + } else { + vuint8m2_t v_t0, v_t1, v_t2, v_t3; + __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl); + v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 0, vl); + v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 0, vl); + v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_t2, 0, vl); + v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_t3, 0, vl); + t += 4 * vl; + } + + v_t0_u16 = __riscv_vwmaccu_vx_u16m4(v_t0_u16, 3, v_s0, vl); + v_t1_u16 = __riscv_vwmaccu_vx_u16m4(v_t1_u16, 3, v_s1, vl); + v_t2_u16 = __riscv_vwmaccu_vx_u16m4(v_t2_u16, 3, v_s2, vl); + v_t3_u16 = __riscv_vwmaccu_vx_u16m4(v_t3_u16, 3, v_s3, vl); + + // Use round-to-nearest-up mode for vnclip & averaging add + v_u0 = __riscv_vnclipu_wx_u8m2(v_t0_u16, 2, vl); + v_u1 = __riscv_vnclipu_wx_u8m2(v_t1_u16, 2, vl); + v_u2 = __riscv_vnclipu_wx_u8m2(v_t2_u16, 2, vl); + v_u3 = __riscv_vnclipu_wx_u8m2(v_t3_u16, 2, vl); + + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 + v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u1, 0, vl); + v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u0, vl); + v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl); + + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 + v_a1 = __riscv_vaaddu_vv_u8m2(v_u1, v_u2, vl); + + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 + v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u2, 0, vl); + v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u3, vl); + v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl); + + __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl); + + w -= vl; + s += 4 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN34_1_BOX_RVV +void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). 
+ asm volatile("csrwi vxrm, 0"); + do { + vuint8m2_t v_s0, v_s1, v_s2, v_s3; + vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3; + vuint16m4_t v_u1_u16; + vuint8m2_t v_a0, v_a1, v_a2; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl); + + // Use round-to-nearest-up mode for vnclip & averaging add + if (src_stride == 0) { + v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_s0, vl); + v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_s1, vl); + v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_s2, vl); + v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_s3, vl); + } else { + vuint8m2_t v_t0, v_t1, v_t2, v_t3; + __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl); + v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_t0, vl); + v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_t1, vl); + v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_t2, vl); + v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_t3, vl); + t += 4 * vl; + } + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 + v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave1, 0, vl); + v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave0, vl); + v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl); + + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 + v_a1 = __riscv_vaaddu_vv_u8m2(v_ave1, v_ave2, vl); + + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 + v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave2, 0, vl); + v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave3, vl); + v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl); + + __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl); + + w -= vl; + s += 4 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN38_RVV +void ScaleRowDown38_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + (void)src_stride; + assert(dst_width % 3 == 0); + do { + vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7; + size_t vl = __riscv_vsetvl_e8m1(w); + __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6, + &v_s7, src_ptr, vl); + __riscv_vsseg3e8_v_u8m1(dst_ptr, v_s0, v_s3, v_s6, vl); + w -= vl; + src_ptr += 8 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN38_2_BOX_RVV +void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + const uint16_t coeff_a = (65536u / 6u); + const uint16_t coeff_b = (65536u / 4u); + assert((dst_width % 3 == 0) && (dst_width > 0)); + do { + vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7; + vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7; + vuint16m2_t v_e0, v_e1, v_e2, v_e; + vuint16m2_t v_f0, v_f1, v_f2, v_f; + vuint16m2_t v_g0, v_g1, v_g; + vuint8m1_t v_dst_e, v_dst_f, v_dst_g; + size_t vl = __riscv_vsetvl_e8m1(w); + // s: e00, e10, e20, f00, f10, f20, g00, g10 + // t: e01, e11, e21, f01, f11, f21, g01, g11 + __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6, + &v_s7, src_ptr, vl); + __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6, + &v_t7, src_ptr + src_stride, vl); + // Calculate sum of [e00, e21] to v_e + // Calculate sum of [f00, f21] to v_f + // Calculate sum of [g00, g11] to v_g + v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl); + v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl); + v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl); + v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl); + v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl); + v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl); + v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl); 
+ v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl); + + v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl); + v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl); + v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl); + v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl); + v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl); + + // Average in 16-bit fixed-point + v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl); + v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl); + v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl); + + v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl); + v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl); + v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl); + + __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl); + w -= vl; + src_ptr += 8 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEROWDOWN38_3_BOX_RVV +void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + const uint16_t coeff_a = (65536u / 9u); + const uint16_t coeff_b = (65536u / 6u); + assert((dst_width % 3 == 0) && (dst_width > 0)); + do { + vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7; + vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7; + vuint8m1_t v_u0, v_u1, v_u2, v_u3, v_u4, v_u5, v_u6, v_u7; + vuint16m2_t v_e0, v_e1, v_e2, v_e3, v_e4, v_e; + vuint16m2_t v_f0, v_f1, v_f2, v_f3, v_f4, v_f; + vuint16m2_t v_g0, v_g1, v_g2, v_g; + vuint8m1_t v_dst_e, v_dst_f, v_dst_g; + size_t vl = __riscv_vsetvl_e8m1(w); + // s: e00, e10, e20, f00, f10, f20, g00, g10 + // t: e01, e11, e21, f01, f11, f21, g01, g11 + // u: e02, e12, e22, f02, f12, f22, g02, g12 + __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6, + &v_s7, src_ptr, vl); + __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6, + &v_t7, src_ptr + src_stride, vl); + __riscv_vlseg8e8_v_u8m1(&v_u0, &v_u1, &v_u2, &v_u3, &v_u4, &v_u5, &v_u6, + &v_u7, src_ptr + 2 * src_stride, vl); + // Calculate sum of [e00, e22] + v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl); + v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl); + v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl); + v_e3 = __riscv_vwaddu_vv_u16m2(v_u0, v_u1, vl); + v_e4 = __riscv_vwaddu_vx_u16m2(v_u2, 0, vl); + + v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl); + v_e2 = __riscv_vadd_vv_u16m2(v_e2, v_e3, vl); + v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e4, vl); + v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl); + // Calculate sum of [f00, f22] + v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl); + v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl); + v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl); + v_f3 = __riscv_vwaddu_vv_u16m2(v_u3, v_u4, vl); + v_f4 = __riscv_vwaddu_vx_u16m2(v_u5, 0, vl); + + v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl); + v_f2 = __riscv_vadd_vv_u16m2(v_f2, v_f3, vl); + v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f4, vl); + v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl); + // Calculate sum of [g00, g12] + v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl); + v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl); + v_g2 = __riscv_vwaddu_vv_u16m2(v_u6, v_u7, vl); + + v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl); + v_g = __riscv_vadd_vv_u16m2(v_g, v_g2, vl); + + // Average in 16-bit fixed-point + v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl); + v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl); + v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl); + + v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl); + v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl); + v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl); + 
__riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl); + w -= vl; + src_ptr += 8 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#endif + +// ScaleUVRowUp2_(Bi)linear_RVV function is equal to other platforms' +// ScaleRowUp2_(Bi)linear_Any_XXX. We process entire row in this function. Other +// platforms only implement non-edge part of image and process edge with scalar. + +#ifdef HAS_SCALEROWUP2_LINEAR_RVV +void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + size_t work_width = (size_t)dst_width - 1u; + size_t src_width = work_width >> 1u; + const uint8_t* work_src_ptr = src_ptr; + uint8_t* work_dst_ptr = dst_ptr + 1; + size_t vl = __riscv_vsetvlmax_e8m4(); + vuint8m4_t v_3 = __riscv_vmv_v_x_u8m4(3, vl); + dst_ptr[0] = src_ptr[0]; + while (src_width > 0) { + vuint8m4_t v_src0, v_src1, v_dst_odd, v_dst_even; + vuint16m8_t v_src0_u16, v_src1_u16; + size_t vl = __riscv_vsetvl_e8m4(src_width); + v_src0 = __riscv_vle8_v_u8m4(work_src_ptr, vl); + v_src1 = __riscv_vle8_v_u8m4(work_src_ptr + 1, vl); + + v_src0_u16 = __riscv_vwaddu_vx_u16m8(v_src0, 2, vl); + v_src1_u16 = __riscv_vwaddu_vx_u16m8(v_src1, 2, vl); + v_src0_u16 = __riscv_vwmaccu_vv_u16m8(v_src0_u16, v_3, v_src1, vl); + v_src1_u16 = __riscv_vwmaccu_vv_u16m8(v_src1_u16, v_3, v_src0, vl); + + v_dst_odd = __riscv_vnsrl_wx_u8m4(v_src0_u16, 2, vl); + v_dst_even = __riscv_vnsrl_wx_u8m4(v_src1_u16, 2, vl); + + __riscv_vsseg2e8_v_u8m4(work_dst_ptr, v_dst_even, v_dst_odd, vl); + + src_width -= vl; + work_src_ptr += vl; + work_dst_ptr += 2 * vl; + } + dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2]; +} +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_RVV +void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + size_t work_width = ((size_t)dst_width - 1u) & ~1u; + size_t src_width = work_width >> 1u; + const uint8_t* work_s = src_ptr; + const uint8_t* work_t = src_ptr + src_stride; + const uint8_t* s = work_s; + const uint8_t* t = work_t; + uint8_t* d = dst_ptr; + uint8_t* e = dst_ptr + dst_stride; + uint8_t* work_d = d + 1; + uint8_t* work_e = e + 1; + size_t vl = __riscv_vsetvlmax_e16m4(); + vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl); + vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl); + d[0] = (3 * s[0] + t[0] + 2) >> 2; + e[0] = (s[0] + 3 * t[0] + 2) >> 2; + while (src_width > 0) { + vuint8m2_t v_s0, v_s1, v_t0, v_t1; + vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16; + vuint16m4_t v_t0_u16_, v_t1_u16_; + vuint8m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd; + size_t vl = __riscv_vsetvl_e8m2(src_width); + v_s0 = __riscv_vle8_v_u8m2(work_s, vl); + v_s1 = __riscv_vle8_v_u8m2(work_s + 1, vl); + + v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl); + v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl); + v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl); + v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl); + + v_t0 = __riscv_vle8_v_u8m2(work_t, vl); + v_t1 = __riscv_vle8_v_u8m2(work_t + 1, vl); + + v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl); + v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl); + v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl); + v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl); + + v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl); + v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl); + + v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl); + v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl); + 
v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl); + v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl); + + v_dst0_odd = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl); + v_dst0_even = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl); + v_dst1_odd = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl); + v_dst1_even = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl); + + __riscv_vsseg2e8_v_u8m2(work_d, v_dst0_even, v_dst0_odd, vl); + __riscv_vsseg2e8_v_u8m2(work_e, v_dst1_even, v_dst1_odd, vl); + + src_width -= vl; + work_s += vl; + work_t += vl; + work_d += 2 * vl; + work_e += 2 * vl; + } + d[dst_width - 1] = + (3 * s[(dst_width - 1) / 2] + t[(dst_width - 1) / 2] + 2) >> 2; + e[dst_width - 1] = + (s[(dst_width - 1) / 2] + 3 * t[(dst_width - 1) / 2] + 2) >> 2; +} +#endif + +#ifdef HAS_SCALEUVROWDOWN2_RVV +void ScaleUVRowDown2_RVV(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + size_t w = (size_t)dst_width; + const uint32_t* src = (const uint32_t*)src_uv; + uint16_t* dst = (uint16_t*)dst_uv; + (void)src_stride; + do { + size_t vl = __riscv_vsetvl_e32m8(w); + vuint32m8_t v_data = __riscv_vle32_v_u32m8(src, vl); + vuint16m4_t v_u1v1 = __riscv_vnsrl_wx_u16m4(v_data, 16, vl); + __riscv_vse16_v_u16m4(dst, v_u1v1, vl); + w -= vl; + src += vl; + dst += vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEUVROWDOWN2LINEAR_RVV +void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + size_t w = (size_t)dst_width; + const uint16_t* src = (const uint16_t*)src_uv; + (void)src_stride; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). + asm volatile("csrwi vxrm, 0"); + do { + vuint8m4_t v_u0v0, v_u1v1, v_avg; + vuint16m4_t v_u0v0_16, v_u1v1_16; + size_t vl = __riscv_vsetvl_e16m4(w); + __riscv_vlseg2e16_v_u16m4(&v_u0v0_16, &v_u1v1_16, src, vl); + v_u0v0 = __riscv_vreinterpret_v_u16m4_u8m4(v_u0v0_16); + v_u1v1 = __riscv_vreinterpret_v_u16m4_u8m4(v_u1v1_16); + // Use round-to-nearest-up mode for averaging add + v_avg = __riscv_vaaddu_vv_u8m4(v_u0v0, v_u1v1, vl * 2); + __riscv_vse8_v_u8m4(dst_uv, v_avg, vl * 2); + w -= vl; + src += vl * 2; + dst_uv += vl * 2; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEUVROWDOWN2BOX_RVV +void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + const uint8_t* src_uv_row1 = src_uv + src_stride; + size_t w = (size_t)dst_width; + // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up mode(0). 
+ asm volatile("csrwi vxrm, 0"); + do { + vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0; + vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1; + vuint16m4_t v_u0u1_row0, v_u0u1_row1, v_v0v1_row0, v_v0v1_row1; + vuint16m4_t v_sum0, v_sum1; + vuint8m2_t v_dst_u, v_dst_v; + size_t vl = __riscv_vsetvl_e8m2(w); + + __riscv_vlseg4e8_v_u8m2(&v_u0_row0, &v_v0_row0, &v_u1_row0, &v_v1_row0, + src_uv, vl); + __riscv_vlseg4e8_v_u8m2(&v_u0_row1, &v_v0_row1, &v_u1_row1, &v_v1_row1, + src_uv_row1, vl); + + v_u0u1_row0 = __riscv_vwaddu_vv_u16m4(v_u0_row0, v_u1_row0, vl); + v_u0u1_row1 = __riscv_vwaddu_vv_u16m4(v_u0_row1, v_u1_row1, vl); + v_v0v1_row0 = __riscv_vwaddu_vv_u16m4(v_v0_row0, v_v1_row0, vl); + v_v0v1_row1 = __riscv_vwaddu_vv_u16m4(v_v0_row1, v_v1_row1, vl); + + v_sum0 = __riscv_vadd_vv_u16m4(v_u0u1_row0, v_u0u1_row1, vl); + v_sum1 = __riscv_vadd_vv_u16m4(v_v0v1_row0, v_v0v1_row1, vl); + // Use round-to-nearest-up mode for vnclip + v_dst_u = __riscv_vnclipu_wx_u8m2(v_sum0, 2, vl); + v_dst_v = __riscv_vnclipu_wx_u8m2(v_sum1, 2, vl); + + __riscv_vsseg2e8_v_u8m2(dst_uv, v_dst_u, v_dst_v, vl); + + dst_uv += 2 * vl; + src_uv += 4 * vl; + w -= vl; + src_uv_row1 += 4 * vl; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEUVROWDOWN4_RVV +void ScaleUVRowDown4_RVV(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width) { + // Overflow will never happen here, since sizeof(size_t)/sizeof(int)=2. + // dst_width = src_width / 4 and src_width is also int. + size_t w = (size_t)dst_width * 8; + (void)src_stride; + (void)src_stepx; + do { + size_t vl = __riscv_vsetvl_e8m8(w); + vuint8m8_t v_row = __riscv_vle8_v_u8m8(src_uv, vl); + vuint64m8_t v_row_64 = __riscv_vreinterpret_v_u8m8_u64m8(v_row); + // Narrowing without clipping + vuint32m4_t v_tmp = __riscv_vncvt_x_x_w_u32m4(v_row_64, vl / 8); + vuint16m2_t v_dst_16 = __riscv_vncvt_x_x_w_u16m2(v_tmp, vl / 8); + vuint8m2_t v_dst = __riscv_vreinterpret_v_u16m2_u8m2(v_dst_16); + __riscv_vse8_v_u8m2(dst_uv, v_dst, vl / 4); + w -= vl; + src_uv += vl; + dst_uv += vl / 4; + } while (w > 0); +} +#endif + +#ifdef HAS_SCALEUVROWDOWNEVEN_RVV +void ScaleUVRowDownEven_RVV(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width) { + size_t w = (size_t)dst_width; + const ptrdiff_t stride_byte = (ptrdiff_t)src_stepx * 2; + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + (void)src_stride; + do { + size_t vl = __riscv_vsetvl_e16m8(w); + vuint16m8_t v_row = __riscv_vlse16_v_u16m8(src, stride_byte, vl); + __riscv_vse16_v_u16m8(dst, v_row, vl); + w -= vl; + src += vl * src_stepx; + dst += vl; + } while (w > 0); +} +#endif + +// ScaleUVRowUp2_(Bi)linear_RVV function is equal to other platforms' +// ScaleUVRowUp2_(Bi)linear_Any_XXX. We process entire row in this function. +// Other platforms only implement non-edge part of image and process edge with +// scalar. 
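Concretely, what makes the full-row approach work is that vsetvl shrinks the active vector length on the final iteration, so a single loop covers the row including its tail and no scalar epilogue (the job of the _Any_ wrappers on other platforms) is needed. A minimal illustrative pattern, using only intrinsics that appear throughout this file:

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

static void CopyRowSketch_RVV(const uint8_t* src, uint8_t* dst, int width) {
  size_t w = (size_t)width;
  do {
    // vl equals the full vector length until fewer elements remain,
    // then shrinks to cover exactly the tail.
    size_t vl = __riscv_vsetvl_e8m4(w);
    vuint8m4_t v = __riscv_vle8_v_u8m4(src, vl);
    __riscv_vse8_v_u8m4(dst, v, vl);
    w -= vl;
    src += vl;
    dst += vl;
  } while (w > 0);
}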
+ +#ifdef HAS_SCALEUVROWUP2_LINEAR_RVV +void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + size_t work_width = ((size_t)dst_width - 1u) & ~1u; + uint16_t* work_dst_ptr = (uint16_t*)dst_ptr + 1; + const uint8_t* work_src_ptr = src_ptr; + size_t vl = __riscv_vsetvlmax_e8m4(); + vuint8m4_t v_3_u8 = __riscv_vmv_v_x_u8m4(3, vl); + dst_ptr[0] = src_ptr[0]; + dst_ptr[1] = src_ptr[1]; + while (work_width > 0) { + vuint8m4_t v_uv0, v_uv1, v_dst_odd_u8, v_dst_even_u8; + vuint16m4_t v_dst_odd, v_dst_even; + vuint16m8_t v_uv0_u16, v_uv1_u16; + size_t vl = __riscv_vsetvl_e8m4(work_width); + v_uv0 = __riscv_vle8_v_u8m4(work_src_ptr, vl); + v_uv1 = __riscv_vle8_v_u8m4(work_src_ptr + 2, vl); + + v_uv0_u16 = __riscv_vwaddu_vx_u16m8(v_uv0, 2, vl); + v_uv1_u16 = __riscv_vwaddu_vx_u16m8(v_uv1, 2, vl); + + v_uv0_u16 = __riscv_vwmaccu_vv_u16m8(v_uv0_u16, v_3_u8, v_uv1, vl); + v_uv1_u16 = __riscv_vwmaccu_vv_u16m8(v_uv1_u16, v_3_u8, v_uv0, vl); + + v_dst_odd_u8 = __riscv_vnsrl_wx_u8m4(v_uv0_u16, 2, vl); + v_dst_even_u8 = __riscv_vnsrl_wx_u8m4(v_uv1_u16, 2, vl); + + v_dst_even = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_even_u8); + v_dst_odd = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_odd_u8); + + __riscv_vsseg2e16_v_u16m4(work_dst_ptr, v_dst_even, v_dst_odd, vl / 2); + + work_width -= vl; + work_src_ptr += vl; + work_dst_ptr += vl; + } + dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; + dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; +} +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_RVV +void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + size_t work_width = ((size_t)dst_width - 1u) & ~1u; + const uint8_t* work_s = src_ptr; + const uint8_t* work_t = src_ptr + src_stride; + const uint8_t* s = work_s; + const uint8_t* t = work_t; + uint8_t* d = dst_ptr; + uint8_t* e = dst_ptr + dst_stride; + uint16_t* work_d = (uint16_t*)d + 1; + uint16_t* work_e = (uint16_t*)e + 1; + size_t vl = __riscv_vsetvlmax_e16m4(); + vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl); + vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl); + d[0] = (3 * s[0] + t[0] + 2) >> 2; + e[0] = (s[0] + 3 * t[0] + 2) >> 2; + d[1] = (3 * s[1] + t[1] + 2) >> 2; + e[1] = (s[1] + 3 * t[1] + 2) >> 2; + while (work_width > 0) { + vuint8m2_t v_s0, v_s1, v_t0, v_t1; + vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16; + vuint16m4_t v_t0_u16_, v_t1_u16_; + vuint8m2_t v_dst0_odd_u8, v_dst0_even_u8, v_dst1_odd_u8, v_dst1_even_u8; + vuint16m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd; + size_t vl = __riscv_vsetvl_e8m2(work_width); + v_s0 = __riscv_vle8_v_u8m2(work_s, vl); + v_s1 = __riscv_vle8_v_u8m2(work_s + 2, vl); + + v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl); + v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl); + v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl); + v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl); + + v_t0 = __riscv_vle8_v_u8m2(work_t, vl); + v_t1 = __riscv_vle8_v_u8m2(work_t + 2, vl); + + v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl); + v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl); + v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl); + v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl); + + v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl); + v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl); + + v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl); + v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, 
v_3_u16, v_s1_u16, vl); + v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl); + v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl); + + v_dst0_odd_u8 = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl); + v_dst0_even_u8 = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl); + v_dst1_odd_u8 = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl); + v_dst1_even_u8 = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl); + + v_dst0_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_even_u8); + v_dst0_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_odd_u8); + v_dst1_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_even_u8); + v_dst1_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_odd_u8); + + __riscv_vsseg2e16_v_u16m2(work_d, v_dst0_even, v_dst0_odd, vl / 2); + __riscv_vsseg2e16_v_u16m2(work_e, v_dst1_even, v_dst1_odd, vl / 2); + + work_width -= vl; + work_s += vl; + work_t += vl; + work_d += vl; + work_e += vl; + } + d[2 * dst_width - 2] = + (3 * s[((dst_width + 1) & ~1) - 2] + t[((dst_width + 1) & ~1) - 2] + 2) >> + 2; + e[2 * dst_width - 2] = + (s[((dst_width + 1) & ~1) - 2] + 3 * t[((dst_width + 1) & ~1) - 2] + 2) >> + 2; + d[2 * dst_width - 1] = + (3 * s[((dst_width + 1) & ~1) - 1] + t[((dst_width + 1) & ~1) - 1] + 2) >> + 2; + e[2 * dst_width - 1] = + (s[((dst_width + 1) & ~1) - 1] + 3 * t[((dst_width + 1) & ~1) - 1] + 2) >> + 2; +} +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && + // defined(__clang__) diff --git a/source/scale_uv.cc b/source/scale_uv.cc index 1556071d..536b9436 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -128,6 +128,15 @@ static void ScaleUVDown2(int src_width, } } #endif +#if defined(HAS_SCALEUVROWDOWN2_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_RVV + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_RVV + : ScaleUVRowDown2Box_RVV); + } +#endif // This code is not enabled. Only box filter is available at this time. #if defined(HAS_SCALEUVROWDOWN2_SSSE3) @@ -231,6 +240,11 @@ static void ScaleUVDown4Box(int src_width, } } #endif +#if defined(HAS_SCALEUVROWDOWN2BOX_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_RVV; + } +#endif for (j = 0; j < dst_height; ++j) { ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2); @@ -310,6 +324,12 @@ static void ScaleUVDownEven(int src_width, } } #endif +#if defined(HAS_SCALEUVROWDOWNEVEN_RVV) + if (TestCpuFlag(kCpuHasRVV) && !filtering) { + ScaleUVRowDownEven = + (col_step == 4) ? ScaleUVRowDown4_RVV : ScaleUVRowDownEven_RVV; + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -637,14 +657,14 @@ static void ScaleUVBilinearUp(int src_width, // This is an optimized version for scaling up a plane to 2 times of // its original width, using linear interpolation. // This is used to scale U and V planes of NV16 to NV24. 
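This is the same 3:1 kernel as the planar case, but with interleaved UV a sample's horizontal neighbour sits 2 bytes away, and output pairs stay interleaved (the RVV row below loads from src and src + 2 and stores pairs with vsseg2e16). An illustrative scalar sketch (hypothetical name; dst_width counts UV pairs and is assumed even):

#include <stdint.h>

static void UVUp2LinearRowSketch(const uint8_t* src, uint8_t* dst,
                                 int dst_width) {
  int src_width = dst_width >> 1;
  dst[0] = src[0];  // left edge pair is copied
  dst[1] = src[1];
  for (int x = 0; x < src_width - 1; ++x) {
    for (int c = 0; c < 2; ++c) {  // c = 0: U, c = 1: V
      int p0 = src[2 * x + c];
      int p1 = src[2 * x + c + 2];  // same channel, next UV pair
      dst[4 * x + c + 2] = (uint8_t)((3 * p0 + p1 + 2) >> 2);
      dst[4 * x + c + 4] = (uint8_t)((p0 + 3 * p1 + 2) >> 2);
    }
  }
  dst[2 * dst_width - 2] = src[2 * src_width - 2];  // right edge pair
  dst[2 * dst_width - 1] = src[2 * src_width - 1];
}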
-void ScaleUVLinearUp2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv) { +static void ScaleUVLinearUp2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv) { void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) = ScaleUVRowUp2_Linear_Any_C; int i; @@ -672,6 +692,12 @@ void ScaleUVLinearUp2(int src_width, } #endif +#ifdef HAS_SCALEUVROWUP2_LINEAR_RVV + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowUp = ScaleUVRowUp2_Linear_RVV; + } +#endif + if (dst_height == 1) { ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv, dst_width); @@ -690,14 +716,14 @@ void ScaleUVLinearUp2(int src_width, // This is an optimized version for scaling up a plane to 2 times of // its original size, using bilinear interpolation. // This is used to scale U and V planes of NV12 to NV24. -void ScaleUVBilinearUp2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { +static void ScaleUVBilinearUp2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_Any_C; @@ -725,6 +751,12 @@ void ScaleUVBilinearUp2(int src_width, } #endif +#ifdef HAS_SCALEUVROWUP2_BILINEAR_RVV + if (TestCpuFlag(kCpuHasRVV)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_RVV; + } +#endif + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); dst_ptr += dst_stride; for (x = 0; x < src_height - 1; ++x) { @@ -744,14 +776,14 @@ void ScaleUVBilinearUp2(int src_width, // This is an optimized version for scaling up a plane to 2 times of // its original width, using linear interpolation. // This is used to scale U and V planes of P210 to P410. -void ScaleUVLinearUp2_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_uv, - uint16_t* dst_uv) { +static void ScaleUVLinearUp2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_uv, + uint16_t* dst_uv) { void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = ScaleUVRowUp2_Linear_16_Any_C; int i; @@ -797,14 +829,14 @@ void ScaleUVLinearUp2_16(int src_width, // This is an optimized version for scaling up a plane to 2 times of // its original size, using bilinear interpolation. // This is used to scale U and V planes of P010 to P410. 
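At the plane level, the 2x bilinear drivers share one row schedule: each source row pair produces two destination rows, one weighted 3:1 toward each row, with the top edge blending the first row against itself via a zero stride (and presumably a matching bottom-edge call, which falls outside the visible hunk). A sketch with the row kernel passed in (names hypothetical):

#include <stddef.h>
#include <stdint.h>

typedef void (*Scale2RowUpFn)(const uint16_t* src, ptrdiff_t src_stride,
                              uint16_t* dst, ptrdiff_t dst_stride,
                              int dst_width);

static void Up2BilinearPlaneSketch(Scale2RowUpFn Scale2RowUp,
                                   const uint16_t* src, int src_stride,
                                   uint16_t* dst, int dst_stride,
                                   int src_height, int dst_width) {
  // Top edge: a zero source stride makes the kernel read the row twice.
  Scale2RowUp(src, 0, dst, 0, dst_width);
  dst += dst_stride;
  // Interior: rows (y, y + 1) yield two output rows per step.
  for (int y = 0; y < src_height - 1; ++y) {
    Scale2RowUp(src, src_stride, dst, dst_stride, dst_width);
    src += src_stride;
    dst += 2 * dst_stride;
  }
  // Bottom edge (assumed): blend the last row with itself.
  Scale2RowUp(src, 0, dst, 0, dst_width);
}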
-void ScaleUVBilinearUp2_16(int src_width,
-                           int src_height,
-                           int dst_width,
-                           int dst_height,
-                           int src_stride,
-                           int dst_stride,
-                           const uint16_t* src_ptr,
-                           uint16_t* dst_ptr) {
+static void ScaleUVBilinearUp2_16(int src_width,
+                                  int src_height,
+                                  int dst_width,
+                                  int dst_height,
+                                  int src_stride,
+                                  int dst_stride,
+                                  const uint16_t* src_ptr,
+                                  uint16_t* dst_ptr) {
   void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                       uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
       ScaleUVRowUp2_Bilinear_16_Any_C;
diff --git a/tools_libyuv/OWNERS b/tools_libyuv/OWNERS
new file mode 100644
index 00000000..aae4fb6e
--- /dev/null
+++ b/tools_libyuv/OWNERS
@@ -0,0 +1,4 @@
+mbonadei@chromium.org
+fbarchard@chromium.org
+pbos@chromium.org
+
diff --git a/tools_libyuv/autoroller/roll_deps.py b/tools_libyuv/autoroller/roll_deps.py
index 2b57eb65..d5c1089f 100755
--- a/tools_libyuv/autoroller/roll_deps.py
+++ b/tools_libyuv/autoroller/roll_deps.py
@@ -31,6 +31,7 @@ def FindSrcDirPath():
 
 # Skip these dependencies (list without solution name prefix).
 DONT_AUTOROLL_THESE = [
+    'third_party/fuchsia-gn-sdk',
     'src/third_party/gflags/src',
     'src/third_party/mockito/src',
 ]
diff --git a/tools_libyuv/msan/OWNERS b/tools_libyuv/msan/OWNERS
new file mode 100644
index 00000000..9b67a8f6
--- /dev/null
+++ b/tools_libyuv/msan/OWNERS
@@ -0,0 +1,3 @@
+mbonadei@chromium.org
+fbarchard@chromium.org
+pbos@chromium.org
diff --git a/tools_libyuv/ubsan/OWNERS b/tools_libyuv/ubsan/OWNERS
new file mode 100644
index 00000000..9b67a8f6
--- /dev/null
+++ b/tools_libyuv/ubsan/OWNERS
@@ -0,0 +1,3 @@
+mbonadei@chromium.org
+fbarchard@chromium.org
+pbos@chromium.org
diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc
index 93867fa7..431343e3 100644
--- a/unit_test/cpu_test.cc
+++ b/unit_test/cpu_test.cc
@@ -137,6 +137,9 @@ TEST_F(LibYUVBaseTest, TestCompilerMacros) {
 #ifdef __riscv_vector
   printf("__riscv_vector %d\n", __riscv_vector);
 #endif
+#ifdef __riscv_v_intrinsic
+  printf("__riscv_v_intrinsic %d\n", __riscv_v_intrinsic);
+#endif
 #ifdef __APPLE__
   printf("__APPLE__ %d\n", __APPLE__);
 #endif
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index ad97b87e..ec1d72eb 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -30,9 +30,9 @@
 #endif
 
 #if defined(LIBYUV_BIT_EXACT)
-#define EXPECTED_ATTENUATE_DIFF 0
+#define EXPECTED_UNATTENUATE_DIFF 0
 #else
-#define EXPECTED_ATTENUATE_DIFF 2
+#define EXPECTED_UNATTENUATE_DIFF 2
 #endif
 
 namespace libyuv {
@@ -57,12 +57,17 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
   orig_pixels[2 * 4 + 0] = 16u;
   orig_pixels[2 * 4 + 1] = 64u;
   orig_pixels[2 * 4 + 2] = 192u;
-  orig_pixels[2 * 4 + 3] = 255u;
+  orig_pixels[2 * 4 + 3] = 128u;
   orig_pixels[3 * 4 + 0] = 16u;
   orig_pixels[3 * 4 + 1] = 64u;
   orig_pixels[3 * 4 + 2] = 192u;
-  orig_pixels[3 * 4 + 3] = 128u;
-  ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 4, 1);
+  orig_pixels[3 * 4 + 3] = 255u;
+  orig_pixels[4 * 4 + 0] = 255u;
+  orig_pixels[4 * 4 + 1] = 255u;
+  orig_pixels[4 * 4 + 2] = 255u;
+  orig_pixels[4 * 4 + 3] = 255u;
+
+  ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 5, 1);
   EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]);
   EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]);
   EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]);
@@ -71,14 +76,55 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
   EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]);
   EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]);
   EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]);
-  EXPECT_EQ(16u, unatten_pixels[2 * 4 + 0]);
-  EXPECT_EQ(64u, unatten_pixels[2 * 4 + 1]);
-  EXPECT_EQ(192u, unatten_pixels[2 * 4 + 2]);
-  EXPECT_EQ(255u, unatten_pixels[2 * 4 + 3]);
-  EXPECT_EQ(32u, unatten_pixels[3 * 4 + 0]);
-  EXPECT_EQ(128u, unatten_pixels[3 * 4 + 1]);
-  EXPECT_EQ(255u, unatten_pixels[3 * 4 + 2]);
-  EXPECT_EQ(128u, unatten_pixels[3 * 4 + 3]);
+  EXPECT_EQ(32u, unatten_pixels[2 * 4 + 0]);
+  EXPECT_EQ(128u, unatten_pixels[2 * 4 + 1]);
+  EXPECT_EQ(255u, unatten_pixels[2 * 4 + 2]);
+  EXPECT_EQ(128u, unatten_pixels[2 * 4 + 3]);
+  EXPECT_EQ(16u, unatten_pixels[3 * 4 + 0]);
+  EXPECT_EQ(64u, unatten_pixels[3 * 4 + 1]);
+  EXPECT_EQ(192u, unatten_pixels[3 * 4 + 2]);
+  EXPECT_EQ(255u, unatten_pixels[3 * 4 + 3]);
+  EXPECT_EQ(255u, unatten_pixels[4 * 4 + 0]);
+  EXPECT_EQ(255u, unatten_pixels[4 * 4 + 1]);
+  EXPECT_EQ(255u, unatten_pixels[4 * 4 + 2]);
+  EXPECT_EQ(255u, unatten_pixels[4 * 4 + 3]);
+
+  ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 5, 1);
+  EXPECT_EQ(100u, atten_pixels[0 * 4 + 0]);
+  EXPECT_EQ(65u, atten_pixels[0 * 4 + 1]);
+  EXPECT_EQ(64u, atten_pixels[0 * 4 + 2]);
+  EXPECT_EQ(128u, atten_pixels[0 * 4 + 3]);
+  EXPECT_EQ(0u, atten_pixels[1 * 4 + 0]);
+  EXPECT_EQ(0u, atten_pixels[1 * 4 + 1]);
+  EXPECT_EQ(0u, atten_pixels[1 * 4 + 2]);
+  EXPECT_EQ(0u, atten_pixels[1 * 4 + 3]);
+  EXPECT_EQ(8u, atten_pixels[2 * 4 + 0]);
+  EXPECT_EQ(32u, atten_pixels[2 * 4 + 1]);
+  EXPECT_EQ(96u, atten_pixels[2 * 4 + 2]);
+  EXPECT_EQ(128u, atten_pixels[2 * 4 + 3]);
+  EXPECT_EQ(16u, atten_pixels[3 * 4 + 0]);
+  EXPECT_EQ(64u, atten_pixels[3 * 4 + 1]);
+  EXPECT_EQ(192u, atten_pixels[3 * 4 + 2]);
+  EXPECT_EQ(255u, atten_pixels[3 * 4 + 3]);
+  EXPECT_EQ(255u, atten_pixels[4 * 4 + 0]);
+  EXPECT_EQ(255u, atten_pixels[4 * 4 + 1]);
+  EXPECT_EQ(255u, atten_pixels[4 * 4 + 2]);
+  EXPECT_EQ(255u, atten_pixels[4 * 4 + 3]);
+
+  // test 255
+  for (int i = 0; i < 256; ++i) {
+    orig_pixels[i * 4 + 0] = i;
+    orig_pixels[i * 4 + 1] = 0;
+    orig_pixels[i * 4 + 2] = 0;
+    orig_pixels[i * 4 + 3] = 255;
+  }
+  ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 256, 1);
+  for (int i = 0; i < 256; ++i) {
+    EXPECT_EQ(orig_pixels[i * 4 + 0], atten_pixels[i * 4 + 0]);
+    EXPECT_EQ(0, atten_pixels[i * 4 + 1]);
+    EXPECT_EQ(0, atten_pixels[i * 4 + 2]);
+    EXPECT_EQ(255, atten_pixels[i * 4 + 3]);
+  }
 
   for (int i = 0; i < 1280; ++i) {
     orig_pixels[i * 4 + 0] = i;
@@ -92,10 +138,10 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
     ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1);
   }
   for (int i = 0; i < 1280; ++i) {
-    EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 2);
-    EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 2);
-    EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 2);
-    EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 2);
+    EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 1);
+    EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 1);
+    EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 1);
+    EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 1);
   }
   // Make sure transparent, 50% and opaque are fully accurate.
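// Context for the exactness checks below: with round-to-nearest attenuation,
// alpha 255 is an identity, alpha 0 clears the color channels, and alpha 128
// halves them (with rounding), so those rows can be compared exactly. A
// sketch of the per-channel math, assuming plain rounded division by 255
// (the optimized kernels may compute it with a multiply/shift equivalent);
// the helper name is illustrative:
static inline uint8_t AttenuateChannelSketch(uint8_t v, uint8_t a) {
  // Round-to-nearest v * a / 255: exact identity when a == 255.
  return (uint8_t)(((uint32_t)v * a + 127) / 255);
}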
   EXPECT_EQ(0, atten_pixels[0 * 4 + 0]);
@@ -106,9 +152,9 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
   EXPECT_EQ(32, atten_pixels[128 * 4 + 1]);
   EXPECT_EQ(21, atten_pixels[128 * 4 + 2]);
   EXPECT_EQ(128, atten_pixels[128 * 4 + 3]);
-  EXPECT_NEAR(254, atten_pixels[255 * 4 + 0], EXPECTED_ATTENUATE_DIFF);
-  EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], EXPECTED_ATTENUATE_DIFF);
-  EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], EXPECTED_ATTENUATE_DIFF);
+  EXPECT_EQ(255, atten_pixels[255 * 4 + 0]);
+  EXPECT_EQ(127, atten_pixels[255 * 4 + 1]);
+  EXPECT_EQ(85, atten_pixels[255 * 4 + 2]);
   EXPECT_EQ(255, atten_pixels[255 * 4 + 3]);
 
   free_aligned_buffer_page_end(atten2_pixels);
@@ -165,28 +211,28 @@ TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) {
                              benchmark_iterations_, disable_cpu_flags_,
                              benchmark_cpu_info_, +1, 0);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_EQ(max_diff, 0);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) {
   int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
                                 benchmark_iterations_, disable_cpu_flags_,
                                 benchmark_cpu_info_, +1, 1);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_EQ(max_diff, 0);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) {
   int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
                                 benchmark_iterations_, disable_cpu_flags_,
                                 benchmark_cpu_info_, -1, 0);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_EQ(max_diff, 0);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) {
   int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
                                 benchmark_iterations_, disable_cpu_flags_,
                                 benchmark_cpu_info_, +1, 0);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_EQ(max_diff, 0);
 }
 
 static int TestUnattenuateI(int width,
@@ -238,28 +284,28 @@ TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) {
   int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_,
                                   benchmark_iterations_, disable_cpu_flags_,
                                   benchmark_cpu_info_, +1, 0);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) {
   int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
                                   benchmark_iterations_, disable_cpu_flags_,
                                   benchmark_cpu_info_, +1, 1);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) {
   int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
                                   benchmark_iterations_, disable_cpu_flags_,
                                   benchmark_cpu_info_, -1, 0);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) {
   int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
                                   benchmark_iterations_, disable_cpu_flags_,
                                   benchmark_cpu_info_, +1, 0);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
 }
 
 TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
@@ -2749,12 +2795,23 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
   MaskCpuFlags(disable_cpu_flags_);
   ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c,
                    benchmark_width_, benchmark_width_, benchmark_height_);
-  MaskCpuFlags(benchmark_cpu_info_);
+  double c_time = get_time();
+  ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c,
+                   benchmark_width_, benchmark_width_, benchmark_height_);
+  c_time = (get_time() - c_time);
+  MaskCpuFlags(benchmark_cpu_info_);
+  ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
+                   benchmark_width_, benchmark_width_, benchmark_height_);
+  double opt_time = get_time();
   for (int i = 0; i < benchmark_iterations_; ++i) {
     ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
                      benchmark_width_, benchmark_width_, benchmark_height_);
   }
+  opt_time = (get_time() - opt_time) / benchmark_iterations_;
+  // Report performance of C vs OPT
+  printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6),
+         static_cast<int>(opt_time * 1e6));
 
   for (int i = 0; i < kPixels; ++i) {
     EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
@@ -2777,12 +2834,24 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) {
   MaskCpuFlags(disable_cpu_flags_);
   ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c,
                    benchmark_width_ * 4, benchmark_width_, benchmark_height_);
-  MaskCpuFlags(benchmark_cpu_info_);
+  double c_time = get_time();
+  ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c,
+                   benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+  c_time = (get_time() - c_time);
+  MaskCpuFlags(benchmark_cpu_info_);
+  ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt,
+                   benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+  double opt_time = get_time();
   for (int i = 0; i < benchmark_iterations_; ++i) {
     ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt,
                      benchmark_width_ * 4, benchmark_width_, benchmark_height_);
   }
+  opt_time = (get_time() - opt_time) / benchmark_iterations_;
+
+  // Report performance of C vs OPT
+  printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6),
+         static_cast<int>(opt_time * 1e6));
 
   for (int i = 0; i < kPixels * 4; ++i) {
     EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
@@ -4468,4 +4537,83 @@ TEST_F(LibYUVPlanarTest, NV21Copy) {
   free_aligned_buffer_page_end(dst_vu);
 }
 
+#if defined(ENABLE_ROW_TESTS) && !defined(LIBYUV_DISABLE_NEON) && \
+    defined(__aarch64__)
+
+TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32) {
+  int i, j;
+  const int y_plane_size = benchmark_width_ * benchmark_height_;
+
+  align_buffer_page_end(orig_f, y_plane_size * 4);
+  align_buffer_page_end(orig_y, y_plane_size * 2);
+  align_buffer_page_end(dst_opt, y_plane_size * 4);
+  align_buffer_page_end(rec_opt, y_plane_size * 2);
+
+  for (i = 0; i < y_plane_size; ++i) {
+    ((float*)orig_f)[i] = (float)(i % 10000) * 3.14f;
+  }
+  memset(orig_y, 1, y_plane_size * 2);
+  memset(dst_opt, 2, y_plane_size * 4);
+  memset(rec_opt, 3, y_plane_size * 2);
+
+  ConvertFP32ToFP16Row_NEON((const float*)orig_f, (uint16_t*)orig_y,
+                            y_plane_size);
+
+  for (j = 0; j < benchmark_iterations_; j++) {
+    ConvertFP16ToFP32Row_NEON((const uint16_t*)orig_y, (float*)dst_opt,
+                              y_plane_size);
+  }
+
+  ConvertFP32ToFP16Row_NEON((const float*)dst_opt, (uint16_t*)rec_opt,
+                            y_plane_size);
+
+  for (i = 0; i < y_plane_size; ++i) {
+    EXPECT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]);
+  }
+
+  free_aligned_buffer_page_end(orig_f);
+  free_aligned_buffer_page_end(orig_y);
+  free_aligned_buffer_page_end(dst_opt);
+  free_aligned_buffer_page_end(rec_opt);
+}
+
+TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32Column) {
+  int i, j;
+  const int y_plane_size = benchmark_width_ * benchmark_height_;
+
+  align_buffer_page_end(orig_f, y_plane_size * 4);
+  align_buffer_page_end(orig_y, y_plane_size * 2);
+  align_buffer_page_end(dst_opt, y_plane_size * 4);
+  align_buffer_page_end(rec_opt, y_plane_size * 2);
+
+  for (i = 0; i < y_plane_size; ++i) {
+    ((float*)orig_f)[i] = (float)(i % 10000) * 3.14f;
+  }
+  memset(orig_y, 1, y_plane_size * 2);
+  memset(dst_opt, 2, y_plane_size * 4);
+  memset(rec_opt, 3, y_plane_size * 2);
+
+  ConvertFP32ToFP16Row_NEON((const float*)orig_f, (uint16_t*)orig_y,
+                            y_plane_size);
+
+  for (j = 0; j < benchmark_iterations_; j++) {
+    ConvertFP16ToFP32Column_NEON((const uint16_t*)orig_y, 1, (float*)dst_opt,
+                                 y_plane_size);
+  }
+
+  ConvertFP32ToFP16Row_NEON((const float*)dst_opt, (uint16_t*)rec_opt,
+                            y_plane_size);
+
+  for (i = 0; i < y_plane_size; ++i) {
+    EXPECT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]);
+  }
+
+  free_aligned_buffer_page_end(orig_f);
+  free_aligned_buffer_page_end(orig_y);
+  free_aligned_buffer_page_end(dst_opt);
+  free_aligned_buffer_page_end(rec_opt);
+}
+
+#endif  // defined(ENABLE_ROW_TESTS) && defined(__aarch64__)
+
 }  // namespace libyuv
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index a8c95268..c2232e66 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -1217,48 +1217,6 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
 }
 #endif  // HAS_SCALEROWDOWN2_SSSE3
 
-extern "C" void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
-                                    ptrdiff_t src_stride,
-                                    uint16_t* dst,
-                                    int dst_width);
-extern "C" void ScaleRowUp2_16_C(const uint16_t* src_ptr,
-                                 ptrdiff_t src_stride,
-                                 uint16_t* dst,
-                                 int dst_width);
-
-TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
-  SIMD_ALIGNED(uint16_t orig_pixels[640 * 2 + 1]);  // 2 rows + 1 pixel overrun.
-  SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
-  SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
-
-  memset(orig_pixels, 0, sizeof(orig_pixels));
-  memset(dst_pixels_opt, 1, sizeof(dst_pixels_opt));
-  memset(dst_pixels_c, 2, sizeof(dst_pixels_c));
-
-  for (int i = 0; i < 640 * 2 + 1; ++i) {
-    orig_pixels[i] = i;
-  }
-  ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_c[0], 1280);
-  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-    int has_neon = TestCpuFlag(kCpuHasNEON);
-    if (has_neon) {
-      ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
-    } else {
-      ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
-    }
-#else
-    ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
-#endif
-  }
-
-  for (int i = 0; i < 1280; ++i) {
-    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
-  }
-  EXPECT_EQ(dst_pixels_c[0], (0 * 9 + 1 * 3 + 640 * 3 + 641 * 1 + 8) / 16);
-  EXPECT_EQ(dst_pixels_c[1279], 800);
-}
-
 extern "C" void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint16_t* dst,